1 /*-
2  * Copyright (c) 1999, 2020 Oracle and/or its affiliates.  All rights reserved.
3  *
4  * See the file LICENSE for license information.
5  *
6  * $Id$
7  */
8 
9 #include "db_config.h"
10 
11 #include "db_int.h"
12 #include "dbinc/db_page.h"
13 #include "dbinc/btree.h"
14 #include "dbinc/hash.h"
15 #include "dbinc/lock.h"
16 #include "dbinc/mp.h"
17 #include "dbinc/txn.h"
18 #include "dbinc/fop.h"
19 #ifdef HAVE_FTRUNCATE
20 static int __db_free_freelist __P((DB *, DB_THREAD_INFO *, DB_TXN *));
21 static int __db_setup_freelist __P((DB *, db_pglist_t *, u_int32_t));
22 #endif
23 
/*
 * SAVE_START --
 *	Snapshot the compaction statistics and the current start key before
 *	an iteration so the whole step can be undone (via RESTORE_START) if
 *	the iteration's transaction deadlocks and is aborted.  Sets "ret" as
 *	a side effect; callers must check it.
 */
#define	SAVE_START							\
	do {								\
		save_data = *c_data;					\
		ret = __db_retcopy(env,				\
		     &save_start, current.data, current.size,		\
		     &save_start.data, &save_start.ulen);		\
	} while (0)

/*
 * Only restore those things that are negated by aborting the
 * transaction.  We don't restore the number of deadlocks, for example.
 */

/*
 * RESTORE_START --
 *	Undo SAVE_START after an aborted iteration: restore the statistics
 *	fields that the aborted work would otherwise have double-counted and
 *	copy the saved start key back into "current".  Sets "ret" as a side
 *	effect; callers must check it.
 */
#define	RESTORE_START							\
	do {								\
		c_data->compact_pages_free =				\
		      save_data.compact_pages_free;			\
		c_data->compact_levels = save_data.compact_levels;	\
		c_data->compact_truncate = save_data.compact_truncate;	\
		c_data->compact_empty_buckets =				\
		    save_data.compact_empty_buckets;			\
		ret = __db_retcopy(env, &current,			\
		     save_start.data, save_start.size,			\
		     &current.data, &current.ulen);			\
	} while (0)
49 
50 /*
51  * __db_compact_int -- compact a database.
52  *
53  * PUBLIC: int __db_compact_int __P((DB *, DB_THREAD_INFO *, DB_TXN *,
54  * PUBLIC:     DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
55  */
56 int
__db_compact_int(dbp,ip,txn,start,stop,c_data,flags,end)57 __db_compact_int(dbp, ip, txn, start, stop, c_data, flags, end)
58 	DB *dbp;
59 	DB_THREAD_INFO *ip;
60 	DB_TXN *txn;
61 	DBT *start, *stop;
62 	DB_COMPACT *c_data;
63 	u_int32_t flags;
64 	DBT *end;
65 {
66 	DBC *dbc;
67 	DBT current, save_start;
68 	DB_COMPACT save_data;
69 	DB_TXN *txn_orig;
70 	ENV *env;
71 	u_int32_t factor, retry;
72 	int deadlock, have_freelist, isdone, ret, span, t_ret, txn_local;
73 
74 #ifdef HAVE_HASH
75 	u_int32_t empty_buckets;
76 #endif
77 
78 #ifdef HAVE_FTRUNCATE
79 	db_pglist_t *list;
80 	db_pgno_t last_pgno;
81 	u_int32_t nelems, truncated;
82 #endif
83 	env = dbp->env;
84 
85 	memset(&current, 0, sizeof(current));
86 	memset(&save_start, 0, sizeof(save_start));
87 	dbc = NULL;
88 	factor = 0;
89 	have_freelist = deadlock = isdone = span = 0;
90 	ret = retry = 0;
91 	txn_orig = txn;
92 
93 #ifdef HAVE_FTRUNCATE
94 	list = NULL;
95 	last_pgno = 0;
96 	nelems = truncated = 0;
97 #endif
98 
99 	/*
100 	 * We pass "current" to the internal routine, indicating where that
101 	 * routine should begin its work and expecting that it will return to
102 	 * us the last key that it processed.
103 	 */
104 	if (start != NULL && (ret = __db_retcopy(env,
105 	     &current, start->data, start->size,
106 	     &current.data, &current.ulen)) != 0)
107 		return (ret);
108 #ifdef HAVE_HASH
109 	empty_buckets = c_data->compact_empty_buckets;
110 #endif
111 
112 	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
113 		txn_local = 1;
114 		LF_SET(DB_AUTO_COMMIT);
115 	} else
116 		txn_local = 0;
117 	if (!LF_ISSET(DB_FREE_SPACE | DB_FREELIST_ONLY))
118 		goto no_free;
119 	if (LF_ISSET(DB_FREELIST_ONLY))
120 		LF_SET(DB_FREE_SPACE);
121 
122 #ifdef HAVE_FTRUNCATE
123 	/* Sort the freelist and set up the in-memory list representation. */
124 	if (txn_local && (ret = __txn_begin(env, ip, txn_orig, &txn, 0)) != 0)
125 		goto err;
126 
127 	if ((ret = __db_free_truncate(dbp, ip,
128 	     txn, flags, c_data, &list, &nelems, &last_pgno)) != 0) {
129 		LF_CLR(DB_FREE_SPACE);
130 		goto terr;
131 	}
132 
133 	/* If the freelist is empty and we are not filling, get out. */
134 	if (nelems == 0 && LF_ISSET(DB_FREELIST_ONLY)) {
135 		ret = 0;
136 		LF_CLR(DB_FREE_SPACE);
137 		goto terr;
138 	}
139 	if ((ret = __db_setup_freelist(dbp, list, nelems)) != 0) {
140 		/* Someone else owns the free list. */
141 		if (ret == EBUSY)
142 			ret = 0;
143 	}
144 	if (ret == 0)
145 		have_freelist = 1;
146 
147 	/* Commit the txn and release the meta page lock. */
148 terr:	if (txn_local) {
149 		if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
150 			ret = t_ret;
151 		txn = NULL;
152 	}
153 	if (ret != 0)
154 		goto err;
155 
156 	/* Save the number truncated so far, we will add what we get below. */
157 	truncated = c_data->compact_pages_truncated;
158 	if (LF_ISSET(DB_FREELIST_ONLY))
159 		goto done;
160 #endif
161 
162 	/*
163 	 * We want factor to be the target number of free bytes on each page,
164 	 * so we know when to stop adding items to a page.   Make sure to
165 	 * subtract the page overhead when computing this target.  This can
166 	 * result in a 1-2% error on the smallest page.
167 	 * First figure out how many bytes we should use:
168 	 */
169 no_free:
170 	factor = dbp->pgsize - SIZEOF_PAGE;
171 	if (c_data->compact_fillpercent != 0) {
172 		factor *= c_data->compact_fillpercent;
173 		factor /= 100;
174 	}
175 	/* Now convert to the number of free bytes to target. */
176 	factor = (dbp->pgsize - SIZEOF_PAGE) - factor;
177 
178 	if (c_data->compact_pages == 0)
179 		c_data->compact_pages = DB_MAX_PAGES;
180 
181 	do {
182 		deadlock = 0;
183 
184 		SAVE_START;
185 		if (ret != 0)
186 			break;
187 
188 		if (txn_local) {
189 			if ((ret =
190 			    __txn_begin(env, ip, txn_orig, &txn, 0)) != 0)
191 				break;
192 
193 			if (c_data->compact_timeout != 0 &&
194 			    (ret = __txn_set_timeout(txn,
195 			    c_data->compact_timeout, DB_SET_LOCK_TIMEOUT)) != 0)
196 				goto err;
197 		}
198 
199 		if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
200 			goto err;
201 
202 #ifdef HAVE_HASH
203 		if (dbp->type == DB_HASH)
204 			ret = __ham_compact_int(dbc,
205 			    &current, stop, factor, c_data, &isdone, flags);
206 		else
207 #endif
208 			ret = __bam_compact_int(dbc, &current, stop, factor,
209 			     &span, c_data, &isdone);
210 		if (ret == DB_LOCK_DEADLOCK && txn_local) {
211 			/*
212 			 * We retry on deadlock.  Cancel the statistics
213 			 * and reset the start point to before this
214 			 * iteration.
215 			 */
216 			deadlock = 1;
217 			c_data->compact_deadlock++;
218 			RESTORE_START;
219 		}
220 		/*
221 		 * If we could not get a lock while holding an internal
222 		 * node latched, commit the current local transaction otherwise
223 		 * report a deadlock.
224 		 */
225 		if (ret == DB_LOCK_NOTGRANTED) {
226 			if (txn_local || retry++ < 5)
227 				ret = 0;
228 			else
229 				ret = DB_LOCK_DEADLOCK;
230 		} else
231 			retry = 0;
232 
233 		if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
234 			ret = t_ret;
235 
236 err:		if (txn_local && txn != NULL) {
237 			if (ret == 0 && deadlock == 0)
238 				ret = __txn_commit(txn, DB_TXN_NOSYNC);
239 			else if ((t_ret = __txn_abort(txn)) != 0 && ret == 0)
240 				ret = t_ret;
241 			txn = NULL;
242 		}
243 		DB_ASSERT(env, ip == NULL || ip->dbth_pincount == 0);
244 	} while (ret == 0 && !isdone);
245 
246 	if (ret == 0 && end != NULL)
247 		ret = __db_retcopy(env, end, current.data, current.size,
248 		    &end->data, &end->ulen);
249 	if (current.data != NULL)
250 		__os_free(env, current.data);
251 	if (save_start.data != NULL)
252 		__os_free(env, save_start.data);
253 
254 #ifdef HAVE_FTRUNCATE
255 	/*
256 	 * Finish up truncation work.  If there are pages left in the free
257 	 * list we can try to move the internal structures around so that we
258 	 * can remove more pages from the file.
259 	 * For BTREE search the internal nodes of the tree as we may have
260 	 * missed some while walking the leaf nodes.
261 	 * For HASH we will compact the hash table itself, moving segments
262 	 * to lower number pages where possible.
263 	 * Then calculate how many pages we have truncated and release
264 	 * the in-memory free list.
265 	 */
266 done:	if (LF_ISSET(DB_FREE_SPACE)) {
267 		DBMETA *meta;
268 		db_pgno_t pgno;
269 		int pgs_done;
270 
271 		pgno = PGNO_BASE_MD;
272 		isdone = 1;
273 		pgs_done = 0;
274 		if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) &&
275 		    __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta) == 0) {
276 			isdone = meta->free == PGNO_INVALID;
277 			ret = __memp_fput(dbp->mpf, ip, meta, dbp->priority);
278 		}
279 
280 #ifdef HAVE_HASH
281 		if (dbp->type == DB_HASH) {
282 			c_data->compact_empty_buckets -= empty_buckets;
283 			if (!isdone || c_data->compact_empty_buckets != 0)
284 				ret = __ham_compact_hash(dbp,
285 				    ip, txn_orig, c_data);
286 			c_data->compact_empty_buckets += empty_buckets;
287 		} else
288 #endif
289 		if (!isdone)
290 			ret = __bam_truncate_ipages(dbp,
291 			    ip, txn_orig, c_data, &pgs_done);
292 
293 		/* Clean up the free list. */
294 		if (list != NULL)
295 			__os_free(env, list);
296 
297 		if ((t_ret =
298 		    __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) {
299 			c_data->compact_pages_truncated =
300 			    truncated + last_pgno - meta->last_pgno;
301 			if ((t_ret = __memp_fput(dbp->mpf, ip,
302 			    meta, dbp->priority)) != 0 && ret == 0)
303 				ret = t_ret;
304 		} else if (ret == 0)
305 			ret = t_ret;
306 
307 		if (have_freelist && (t_ret =
308 		    __db_free_freelist(dbp, ip, txn_orig)) != 0 && ret == 0)
309 			t_ret = ret;
310 	}
311 #endif
312 
313 	return (ret);
314 }
315 
316 #ifdef HAVE_FTRUNCATE
317 static int
__db_setup_freelist(dbp,list,nelems)318 __db_setup_freelist(dbp, list, nelems)
319 	DB *dbp;
320 	db_pglist_t *list;
321 	u_int32_t nelems;
322 {
323 	DB_MPOOLFILE *mpf;
324 	db_pgno_t *plist;
325 	int ret;
326 
327 	mpf = dbp->mpf;
328 
329 	if ((ret = __memp_alloc_freelist(mpf, nelems, &plist)) != 0)
330 		return (ret);
331 
332 	while (nelems-- != 0)
333 		*plist++ = list++->pgno;
334 
335 	return (0);
336 }
337 
/*
 * __db_free_freelist --
 *	Discard the in-memory free-page list cached in the memory pool.
 *	If the handle is in auto-commit mode we take the metadata page lock
 *	ourselves under a local transaction (which is always aborted below --
 *	nothing here needs to be durable); otherwise the caller is expected
 *	to already hold the lock.
 */
static int
__db_free_freelist(dbp, ip, txn)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
{
	DBC *dbc;
	DB_LOCK lock;
	int auto_commit, ret, t_ret;

	LOCK_INIT(lock);
	auto_commit = ret = 0;

	/*
	 * If we are not in a transaction then we need to get
	 * a lock on the meta page, otherwise we should already
	 * have the lock.
	 */

	dbc = NULL;
	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
		/*
		 * We must not timeout the lock or we will not free the list.
		 * We ignore errors from txn_begin as there is little that
		 * the application can do with the error and we want to
		 * get the lock and free the list if at all possible.
		 */
		if (__txn_begin(dbp->env, ip, txn, &txn, 0) == 0) {
			(void)__lock_set_timeout(dbp->env,
			    txn->locker, 0, DB_SET_TXN_TIMEOUT);
			(void)__lock_set_timeout(dbp->env,
			    txn->locker, 0, DB_SET_LOCK_TIMEOUT);
			auto_commit = 1;
		}
		/* Get a cursor so we can call __db_lget. */
		/*
		 * NOTE(review): if __db_cursor fails here after the local
		 * txn was begun above, that txn is not aborted before the
		 * early return -- confirm this is acceptable.
		 */
		if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
			return (ret);

		if ((ret = __db_lget(dbc,
		     0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &lock)) != 0)
			goto err;
	}

	/* Drop the memory pool's cached copy of the free list. */
	ret = __memp_free_freelist(dbp->mpf);

	/* Release the meta lock and cursor; abort any local transaction. */
err:	if (dbc != NULL && (t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
		ret = t_ret;

	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;

	if (auto_commit && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
394 #endif
395 
396 /*
397  * __db_exchange_page -- try to move a page 'down', to earlier in the file.
398  *
399  * This tries to move a page to a lower location the file, by swapping it
400  * with an earlier free page. The free page comes either from the free list or
401  * the newpgno parameter (e.g., __ham_compact_hash()).  If the new page turns
402  * out to be higher than the original one, the allocation is undone and
403  * the caller is left unchanged.  After a successful swap, this routine can
404  * optionally free the old, higher numbered page.
405  * The cursor's stack includes at least the immediate parent of this page.
406  *
407  * PUBLIC: int __db_exchange_page
408  * PUBLIC:    __P((DBC *, PAGE **, PAGE *, db_pgno_t, int, int *));
409  */
int
__db_exchange_page(dbc, pgp, opg, newpgno, flags, pgs_donep)
	DBC *dbc;
	PAGE **pgp, *opg;
	db_pgno_t newpgno;
	int flags;
	int *pgs_donep;
{
	BTREE_CURSOR *cp;
	DB *dbp;
	DBT data, *dp, hdr;
	DB_LSN lsn;
	DB_LOCK lock;
	EPG *epg;
	PAGE *newpage;
	db_pgno_t oldpgno, *pgnop;
	int ret;

	COMPQUIET(oldpgno, 0);

	DB_ASSERT(NULL, dbc != NULL);
	dbp = dbc->dbp;
	LOCK_INIT(lock);

	/*
	 * We want to free a page that lives in the part of the file that
	 * can be truncated, so we're going to move it onto a free page
	 * that is in the part of the file that need not be truncated.
	 * In the case of compacting hash table segments the caller already
	 * identified a contiguous set of pages to use.  Otherwise
	 * since the freelist is ordered now, we can simply call __db_new
	 * which will grab the first element off the freelist; we know this
	 * is the lowest numbered free page.
	 */
	if (newpgno != PGNO_INVALID) {
		/* Caller-supplied target page: just fetch it dirty. */
		if ((ret = __memp_fget(dbp->mpf, &newpgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &newpage)) != 0)
			return (ret);
	} else if ((ret = __db_new(dbc, P_DONTEXTEND | TYPE(*pgp),
	     STD_LOCKING(dbc) && TYPE(*pgp) != P_OVERFLOW ? &lock : NULL,
	     &newpage)) != 0)
		return (ret);

	/*
	 * If newpage is null then __db_new would have had to allocate
	 * a new page from the filesystem, so there is no reason
	 * to continue this action.
	 */
	if (newpage == NULL)
		return (0);

	/*
	 * It is possible that a higher page is allocated if other threads
	 * are allocating at the same time, if so, just put it back.
	 */
	if (PGNO(newpage) > PGNO(*pgp)) {
		/* It is unfortunate but you can't just free a new overflow. */
		/* !!! Is the above comment still true? */
		/* !!! Should __db_new(OVERFLOW) zero OV_LEN()? */
		if (TYPE(newpage) == P_OVERFLOW)
			OV_LEN(newpage) = 0;
		if ((ret = __LPUT(dbc, lock)) != 0)
			return (ret);
		return (__db_free(dbc, newpage, 0));
	}

	/*
	 * Log if necessary.  "hdr" describes the fixed page header and
	 * "data" (via dp) the page body; meta pages are logged header-only.
	 */
	if (DBC_LOGGING(dbc)) {
		memset(&hdr, 0, sizeof(hdr));
		hdr.data = *pgp;
		hdr.size = P_OVERHEAD(dbp);
		memset(&data, 0, sizeof(data));
		dp = &data;
		switch (TYPE(*pgp)) {
		case P_OVERFLOW:
			data.data = (u_int8_t *)*pgp + P_OVERHEAD(dbp);
			data.size = OV_LEN(*pgp);
			break;
		case P_BTREEMETA:
			hdr.size = sizeof(BTMETA);
			dp = NULL;
			break;
		case P_HASHMETA:
			hdr.size = sizeof(HMETA);
			dp = NULL;
			break;
		default:
			/* Leaf/internal pages: log item data and index array. */
			data.data = (u_int8_t *)*pgp + HOFFSET(*pgp);
			data.size = dbp->pgsize - HOFFSET(*pgp);
			hdr.size += NUM_ENT(*pgp) * sizeof(db_indx_t);
		}
		if ((ret = __db_merge_log(dbp, dbc->txn,
		      &LSN(newpage), 0, PGNO(newpage), &LSN(newpage),
		      PGNO(*pgp), &LSN(*pgp), &hdr, dp, 1)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(newpage));

	/*
	 * Copy the old page's image over the new page wholesale, then
	 * restore the new page's own identity (page number and LSN).
	 */
	oldpgno = PGNO(*pgp);
	newpgno = PGNO(newpage);
	lsn = LSN(newpage);
	memcpy(newpage, *pgp, dbp->pgsize);
	PGNO(newpage) = newpgno;
	LSN(newpage) = lsn;

	/* Empty the old page. */
	if ((ret = __memp_dirty(dbp->mpf,
	    pgp, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
		goto err;
	if (TYPE(*pgp) == P_OVERFLOW)
		OV_LEN(*pgp) = 0;
	else {
		HOFFSET(*pgp) = dbp->pgsize;
		NUM_ENT(*pgp) = 0;
	}
	LSN(*pgp) = lsn;

	/* Update siblings: repoint prev/next chains at the new location. */
	switch (TYPE(newpage)) {
	case P_OVERFLOW:
	case P_LBTREE:
	case P_LRECNO:
	case P_LDUP:
	case P_HASH:
		if (NEXT_PGNO(newpage) == PGNO_INVALID &&
		    PREV_PGNO(newpage) == PGNO_INVALID)
			break;
		if ((ret = __db_relink(dbc, *pgp, opg, PGNO(newpage))) != 0)
			goto err;
		break;
	default:
		break;
	}

	/*
	 * For HASH we may reuse the old page for an even higher numbered
	 * page.  Otherwise we free the old page.
	 */
	if (!LF_ISSET(DB_EXCH_FREE)) {
		NEXT_PGNO(*pgp) = PREV_PGNO(*pgp) = PGNO_INVALID;
		ret = __memp_fput(dbp->mpf,
		    dbc->thread_info, *pgp, dbc->priority);
	} else
		ret = __db_free(dbc, *pgp, 0);
	/* From here on the caller's page pointer refers to the new page. */
	*pgp = newpage;

	if (ret != 0)
		return (ret);

	if (!LF_ISSET(DB_EXCH_PARENT))
		goto done;

	/* Update the parent: swap the child pointer to the new page number. */
	cp = (BTREE_CURSOR *)dbc->internal;
	epg = &cp->csp[-1];

	switch (TYPE(epg->page)) {
	case P_IBTREE:
		pgnop = &GET_BINTERNAL(dbp, epg->page, epg->indx)->pgno;
		break;
	case P_IRECNO:
		pgnop = &GET_RINTERNAL(dbp, epg->page, epg->indx)->pgno;
		break;
	case P_LBTREE:
	case P_LRECNO:
	case P_LDUP:
		pgnop = &GET_BOVERFLOW(dbp, epg->page, epg->indx)->pgno;
		break;
	default:
		return (__db_pgfmt(dbp->env, PGNO(epg->page)));
	}
	DB_ASSERT(dbp->env, oldpgno == *pgnop);
	if (DBC_LOGGING(dbc)) {
		if ((ret = __db_pgno_log(dbp, dbc->txn, &LSN(epg->page),
		    0, PGNO(epg->page), &LSN(epg->page), (u_int32_t)epg->indx,
		    *pgnop, PGNO(newpage))) != 0)
			return (ret);
	} else
		LSN_NOT_LOGGED(LSN(epg->page));

	*pgnop = PGNO(newpage);
	cp->csp->page = newpage;
	if ((ret = __TLPUT(dbc, lock)) != 0)
		return (ret);

done:
	(*pgs_donep)++;
	return (0);

	/* Error cleanup: release the new page and the allocation lock. */
err:	(void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority);
	(void)__TLPUT(dbc, lock);
	return (ret);
}
603 
604 /*
605  * __db_truncate_overflow -- find overflow pages to truncate.
606  *	Walk the pages of an overflow chain and swap out
607  * high numbered pages.  We are passed the first page
608  * but only deal with the second and subsequent pages.
609  * PUBLIC: int __db_truncate_overflow __P((DBC *, db_pgno_t,
610  * PUBLIC:    PAGE **, DB_COMPACT *, int *));
611  */
int
__db_truncate_overflow(dbc, pgno, ppg, c_data, pgs_donep)
	DBC *dbc;
	db_pgno_t pgno;
	PAGE **ppg;
	DB_COMPACT *c_data;
	int *pgs_donep;
{
	DB *dbp;
	DB_LOCK lock;
	PAGE *page;
	db_pgno_t ppgno;
	int have_lock, ret, t_ret;

	dbp = dbc->dbp;
	page = NULL;
	LOCK_INIT(lock);
	/* A NULL ppg means the caller already holds the needed lock. */
	have_lock = ppg == NULL;

	if ((ret = __memp_fget(dbp->mpf, &pgno,
	     dbc->thread_info, dbc->txn, 0, &page)) != 0)
		return (ret);

	/* Walk the chain; the first page stays, later pages may be moved. */
	while ((pgno = NEXT_PGNO(page)) != PGNO_INVALID) {
		if ((ret = __memp_fput(dbp->mpf,
		     dbc->thread_info, page, dbc->priority)) != 0)
			return (ret);
		if ((ret = __memp_fget(dbp->mpf, &pgno,
		    dbc->thread_info, dbc->txn, 0, &page)) != 0)
			return (ret);
		/* Pages at or below the truncation point need not move. */
		if (pgno <= c_data->compact_truncate)
			continue;
		if (!have_lock) {
			/*
			 * Lazily take a write lock on the parent before the
			 * first swap: release the parent page, lock it, then
			 * re-fetch it dirty.
			 */
			DB_ASSERT(dbp->env, ppg != NULL);
			ppgno = PGNO(*ppg);
			if ((ret = __memp_fput(dbp->mpf, dbc->thread_info,
			     *ppg, dbc->priority)) != 0)
				goto err;
			*ppg = NULL;
			if ((ret = __db_lget(dbc, 0, ppgno,
			     DB_LOCK_WRITE, 0, &lock)) != 0)
				goto err;
			if ((ret = __memp_fget(dbp->mpf, &ppgno,
			    dbc->thread_info,
			    dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
				goto err;
			have_lock = 1;
		}
		/* Swap this chain page with a lower-numbered free page. */
		if ((ret = __db_exchange_page(dbc,
		    &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
			break;
	}

err:	if (page != NULL &&
	    (t_ret = __memp_fput(dbp->mpf,
	    dbc->thread_info, page, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}
673 
674 /*
675  * __db_truncate_root -- swap a root page for a lower numbered page.
676  * PUBLIC: int __db_truncate_root __P((DBC *,
677  * PUBLIC:      PAGE *, u_int32_t, db_pgno_t *, u_int32_t, int *));
678  */
679 int
__db_truncate_root(dbc,ppg,indx,pgnop,tlen,pgs_donep)680 __db_truncate_root(dbc, ppg, indx, pgnop, tlen, pgs_donep)
681 	DBC *dbc;
682 	PAGE *ppg;
683 	u_int32_t indx;
684 	db_pgno_t *pgnop;
685 	u_int32_t tlen;
686 	int *pgs_donep;
687 {
688 	DB *dbp;
689 	DBT orig;
690 	PAGE *page;
691 	int ret, t_ret;
692 	db_pgno_t newpgno;
693 
694 	dbp = dbc->dbp;
695 
696 	DB_ASSERT(dbc->dbp->env, IS_DIRTY(ppg));
697 	if ((ret = __memp_fget(dbp->mpf, pgnop,
698 	     dbc->thread_info, dbc->txn, 0, &page)) != 0)
699 		goto err;
700 
701 	/*
702 	 * If this is a multiply reference overflow key, then we will just
703 	 * copy it and decrement the reference count.  This is part of a
704 	 * fix to get rid of multiple references.
705 	 */
706 	if (TYPE(page) == P_OVERFLOW && OV_REF(page) > 1) {
707 		COMPQUIET(newpgno, 0);
708 		if ((ret = __db_ovref(dbc, *pgnop)) != 0)
709 			goto err;
710 		memset(&orig, 0, sizeof(orig));
711 		if ((ret = __db_goff(dbc, &orig, tlen, *pgnop,
712 		    &orig.data, &orig.size)) == 0)
713 			ret = __db_poff(dbc, &orig, &newpgno);
714 		if (orig.data != NULL)
715 			__os_free(dbp->env, orig.data);
716 		if (ret != 0)
717 			goto err;
718 	} else {
719 		LOCK_CHECK_OFF(dbc->thread_info);
720 		ret = __db_exchange_page(dbc,
721 		    &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep);
722 		LOCK_CHECK_ON(dbc->thread_info);
723 		if (ret != 0)
724 			goto err;
725 		newpgno = PGNO(page);
726 		/* If we could not allocate from the free list, give up.*/
727 		if (newpgno == *pgnop)
728 			goto err;
729 	}
730 
731 	/* Update the reference. */
732 	if (DBC_LOGGING(dbc)) {
733 		if ((ret = __db_pgno_log(dbp, dbc->txn, &LSN(ppg), 0, PGNO(ppg),
734 		     &LSN(ppg), (u_int32_t)indx, *pgnop, newpgno)) != 0)
735 			goto err;
736 	} else
737 		LSN_NOT_LOGGED(LSN(ppg));
738 
739 	*pgnop = newpgno;
740 
741 err:	if (page != NULL && (t_ret =
742 	      __memp_fput(dbp->mpf, dbc->thread_info,
743 		  page, dbc->priority)) != 0 && ret == 0)
744 		ret = t_ret;
745 	return (ret);
746 }
747 
748 #ifdef HAVE_FTRUNCATE
749 /*
750  * __db_find_free --
751  *	Find a contiguous "size" range of free pages that are lower numbers
752  * than the pages starting at "bstart".  We can also return a set of pages
753  * that overlaps with the pages at "bstart".
754  * PUBLIC: int __db_find_free __P((DBC *, u_int32_t,
755  * PUBLIC:	u_int32_t, db_pgno_t, db_pgno_t *));
756  */
757 int
__db_find_free(dbc,type,size,bstart,freep)758 __db_find_free(dbc, type, size, bstart, freep)
759 	DBC *dbc;
760 	u_int32_t type;
761 	u_int32_t size;
762 	db_pgno_t bstart, *freep;
763 {
764 	DB *dbp;
765 	DBMETA *meta;
766 	DBT listdbt;
767 	DB_LOCK metalock;
768 	DB_LSN lsn;
769 	DB_MPOOLFILE *mpf;
770 	PAGE *page, *freepg;
771 	u_int32_t i, j, start, nelems;
772 	db_pgno_t *list, next_free, pgno;
773 	db_pglist_t *lp, *pglist;
774 	int hash, ret, t_ret;
775 
776 	dbp = dbc->dbp;
777 	mpf = dbp->mpf;
778 	nelems = 0;
779 	hash = 0;
780 	page = NULL;
781 	pglist = NULL;
782 	meta = NULL;
783 	LOCK_INIT(metalock);
784 
785 #ifdef HAVE_HASH
786 	if (dbp->type == DB_HASH) {
787 		if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
788 			return (ret);
789 		if (meta != NULL)
790 			hash = 1;
791 	}
792 #endif
793 	if (meta == NULL) {
794 		pgno = PGNO_BASE_MD;
795 		if ((ret = __db_lget(dbc,
796 		    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
797 			goto err;
798 		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
799 		    DB_MPOOL_DIRTY, &meta)) != 0)
800 			goto err;
801 	}
802 
803 	if ((ret = __memp_get_freelist(mpf, &nelems, &list)) != 0)
804 		goto err;
805 
806 	if (nelems == 0) {
807 		ret = DBC_ERR(dbc, DB_NOTFOUND);
808 		goto err;
809 	}
810 
811 	for (i = 0; i < nelems; i++) {
812 		if (list[i] > bstart) {
813 			ret = DBC_ERR(dbc, DB_NOTFOUND);
814 			goto err;
815 		}
816 		start = i;
817 		if (size == 1)
818 			goto found;
819 		while (i < nelems - 1 && list[i] + 1 == list[i + 1]) {
820 			i++;
821 			if (i - start == size - 1)
822 				goto found;
823 		}
824 		if (i - start == size - 1)
825 			goto found;
826 		/*
827 		 * If the last set of contiguous free pages we found
828 		 * are contiguous to the chunk we are trying to move,
829 		 * then we can slide the allocated chunk back some number
830 		 * of pages -- figure out how many by calculating the
831 		 * number of pages before the allocated ones that we have
832 		 * found in the free list.
833 		 */
834 		if (list[i] == bstart - 1) {
835 			size = (i - start) + 1;
836 			goto found;
837 		}
838 	}
839 	ret = DBC_ERR(dbc, DB_NOTFOUND);
840 	goto err;
841 
842 found:	/* We have size range of pages.  Remove them. */
843 	next_free = i == nelems - 1 ? PGNO_INVALID : list[i + 1];
844 	*freep = list[start];
845 	if (start == 0) {
846 		page = (PAGE *)meta;
847 	} else if ((ret = __memp_fget(mpf, &list[start - 1],
848 	     dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &page)) != 0)
849 		return (ret);
850 	if (DBC_LOGGING(dbc)) {
851 		if ((ret = __os_malloc(dbp->env,
852 		    size * sizeof(db_pglist_t), &pglist)) != 0)
853 			goto err;
854 		lp = pglist;
855 		for (j = start; j < start + size; j++, lp++) {
856 			if ((ret = __memp_fget(mpf, &list[j],
857 			    dbc->thread_info, dbc->txn, 0, &freepg)) != 0)
858 				goto err;
859 			lp->pgno = PGNO(freepg);
860 			lp->next_pgno = NEXT_PGNO(freepg);
861 			lp->lsn = LSN(freepg);
862 			if ((ret = __memp_fput(mpf,
863 			    dbc->thread_info, freepg, dbc->priority)) != 0)
864 				goto err;
865 		}
866 		listdbt.size = size * sizeof(*pglist);
867 		listdbt.data = pglist;
868 		if ((ret = __db_realloc_log(dbp, dbc->txn, &lsn, 0,
869 		    PGNO(page), &LSN(page), next_free, type, &listdbt)) != 0)
870 			goto err;
871 		__os_free(dbp->env, pglist);
872 		pglist = NULL;
873 	} else
874 		LSN_NOT_LOGGED(lsn);
875 
876 	LSN(page) = lsn;
877 	if (start == 0)
878 		meta->free = next_free;
879 	else
880 		NEXT_PGNO(page) = next_free;
881 
882 	if (page != (PAGE *)meta && (ret = __memp_fput(mpf,
883 	    dbc->thread_info, page, dbc->priority)) != 0)
884 		goto err;
885 
886 	for (j = start; j < start + size; j++) {
887 		if ((ret = __memp_fget(mpf,
888 		    &list[j], dbc->thread_info,
889 		    dbc->txn, DB_MPOOL_DIRTY, &freepg)) != 0)
890 			goto err;
891 		P_INIT(freepg, dbp->pgsize,
892 		    list[j], PGNO_INVALID, PGNO_INVALID, 0, type);
893 		LSN(freepg) = lsn;
894 		if ((ret = __memp_fput(mpf,
895 		    dbc->thread_info, freepg, dbc->priority)) != 0)
896 			goto err;
897 	}
898 
899 	if (++i != nelems)
900 		memmove(&list[start], &list[i], (nelems - i) * sizeof(*list));
901 	if ((ret = __memp_extend_freelist(mpf, nelems - size, &list)) != 0)
902 		goto err;
903 	if (hash == 0)
904 		ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
905 	t_ret = __TLPUT(dbc, metalock);
906 
907 	return (ret == 0 ? t_ret : ret);
908 
909 err:	if (page != NULL && page != (PAGE *)meta)
910 		(void)__memp_fput(mpf, dbc->thread_info, page, dbc->priority);
911 	if (pglist != NULL)
912 		__os_free(dbp->env, pglist);
913 	if (meta != NULL && hash == 0)
914 		(void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
915 	(void)__TLPUT(dbc, metalock);
916 	return (ret);
917 }
918 #endif
919 
920 /*
921  * __db_relink --
922  *	Relink around a deleted page.
923  *
924  * PUBLIC: int __db_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t));
925  *	Otherp can be either the previous or the next page to use if
926  * the caller already holds that page.
927  */
int
__db_relink(dbc, pagep, otherp, new_pgno)
	DBC *dbc;
	PAGE *pagep, *otherp;
	db_pgno_t new_pgno;
{
	DB *dbp;
	DB_LOCK npl, ppl;
	DB_LSN *nlsnp, *plsnp, ret_lsn;
	DB_MPOOLFILE *mpf;
	PAGE *np, *pp;
	int ret, t_ret;

	dbp = dbc->dbp;
	np = pp = NULL;
	/*
	 * npl/ppl are initialized but never acquired in this function;
	 * __TLPUT on an initialized-but-unset lock is a no-op.
	 */
	LOCK_INIT(npl);
	LOCK_INIT(ppl);
	nlsnp = plsnp = NULL;
	mpf = dbp->mpf;
	ret = 0;

	/*
	 * Retrieve the one/two pages.  The caller must have them locked
	 * because the parent is latched. For a remove, we may need
	 * two pages (the before and after).  For an add, we only need one
	 * because, the split took care of the prev.
	 */
	if (pagep->next_pgno != PGNO_INVALID) {
		/* Reuse otherp when it already is the next page. */
		if (((np = otherp) == NULL ||
		    PGNO(otherp) != pagep->next_pgno) &&
		    (ret = __memp_fget(mpf, &pagep->next_pgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &np)) != 0) {
			ret = __db_pgerr(dbp, pagep->next_pgno, ret);
			goto err;
		}
		nlsnp = &np->lsn;
	}
	if (pagep->prev_pgno != PGNO_INVALID) {
		/* Reuse otherp when it already is the previous page. */
		if (((pp = otherp) == NULL ||
		    PGNO(otherp) != pagep->prev_pgno) &&
		    (ret = __memp_fget(mpf, &pagep->prev_pgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pp)) != 0) {
			ret = __db_pgerr(dbp, pagep->prev_pgno, ret);
			goto err;
		}
		plsnp = &pp->lsn;
	}

	/* Log the change. */
	if (DBC_LOGGING(dbc)) {
		if ((ret = __db_relink_log(dbp, dbc->txn, &ret_lsn, 0,
		    pagep->pgno, new_pgno, pagep->prev_pgno, plsnp,
		    pagep->next_pgno, nlsnp)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(ret_lsn);
	/* Both siblings carry the LSN of the relink record. */
	if (np != NULL)
		np->lsn = ret_lsn;
	if (pp != NULL)
		pp->lsn = ret_lsn;

	/*
	 * Modify and release the two pages.
	 */
	if (np != NULL) {
		/* PGNO_INVALID means unlink; otherwise repoint at new_pgno. */
		if (new_pgno == PGNO_INVALID)
			np->prev_pgno = pagep->prev_pgno;
		else
			np->prev_pgno = new_pgno;
		if (np != otherp)
			ret = __memp_fput(mpf,
			    dbc->thread_info, np, dbc->priority);
		if ((t_ret = __TLPUT(dbc, npl)) != 0 && ret == 0)
			ret = t_ret;
		if (ret != 0)
			goto err;
	}

	if (pp != NULL) {
		if (new_pgno == PGNO_INVALID)
			pp->next_pgno = pagep->next_pgno;
		else
			pp->next_pgno = new_pgno;
		if (pp != otherp)
			ret = __memp_fput(mpf,
			    dbc->thread_info, pp, dbc->priority);
		if ((t_ret = __TLPUT(dbc, ppl)) != 0 && ret == 0)
			ret = t_ret;
		if (ret != 0)
			goto err;
	}
	return (0);

	/* Never release otherp here; the caller still owns it. */
err:	if (np != NULL && np != otherp)
		(void)__memp_fput(mpf, dbc->thread_info, np, dbc->priority);
	if (pp != NULL && pp != otherp)
		(void)__memp_fput(mpf, dbc->thread_info, pp, dbc->priority);
	return (ret);
}
1027 
1028 /*
1029  * __db_move_metadata -- move a meta data page to a lower page number.
1030  * The meta data page must be exclusively latched on entry.
1031  *
1032  * PUBLIC: int __db_move_metadata
1033  * PUBLIC:     __P((DBC *, DBMETA **, DB_COMPACT *, int *));
1034  */
int
__db_move_metadata(dbc, metap, c_data, pgs_donep)
	DBC *dbc;
	DBMETA **metap;
	DB_COMPACT *c_data;
	int *pgs_donep;
{
	BTREE *bt;
	DB *dbp, *mdbp;
	DB_LOCK handle_lock;
	HASH *ht;
	int ret, t_ret;

	dbp = dbc->dbp;

	/* Try to swap the meta page with a lower-numbered free page. */
	c_data->compact_pages_examine++;
	if ((ret = __db_exchange_page(dbc,
	     (PAGE **)metap, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
		return (ret);

	/* No swap happened: nothing to update. */
	if (PGNO(*metap) == dbp->meta_pgno)
		return (0);

	/* Record the new meta page number in the master database. */
	if ((ret = __db_master_open(dbp,
	     dbc->thread_info, dbc->txn, dbp->fname, 0, 0, &mdbp)) != 0)
		return (ret);

	dbp->meta_pgno = PGNO(*metap);

	if ((ret = __db_master_update(mdbp, dbp, dbc->thread_info,
	     dbc->txn, dbp->dname, dbp->type, MU_MOVE, NULL, 0)) != 0)
		goto err;

	/*
	 * The handle lock for subdb's depends on the metadata page number:
	 * swap the old one for the new one.
	 */
	if (STD_LOCKING(dbc)) {
		/*
		 * If this dbp is still in an opening transaction we need to
		 * change its lock in the event.
		 */
		if (dbp->cur_txn != NULL)
			__txn_remlock(dbp->env,
			    dbp->cur_txn, &dbp->handle_lock, dbp->locker);

		/* Acquire the handle lock under the new page number. */
		handle_lock = dbp->handle_lock;
		if ((ret = __fop_lock_handle(dbp->env, dbp,
		    dbp->cur_locker != NULL ? dbp->cur_locker : dbp->locker,
		    dbp->cur_txn != NULL ? DB_LOCK_WRITE : DB_LOCK_READ,
		    NULL, 0)) != 0)
			goto err;

		/* Move all the other handles to the new lock. */
		if ((ret = __lock_change(dbp->env,
		    &handle_lock, &dbp->handle_lock)) != 0)
			goto err;

		/* Reregister the event. */
		if (dbp->cur_txn != NULL)
			ret = __txn_lockevent(dbp->env,
			    dbp->cur_txn, dbp, &dbp->handle_lock, dbp->locker);
	}
	/* Keep the logging file-name record in sync with the new page. */
	if (dbp->log_filename != NULL)
		dbp->log_filename->meta_pgno = dbp->meta_pgno;
	/*
	 * Update the access method's cached meta page number and bump the
	 * revision so other handles notice the move.
	 */
	if (dbp->type == DB_HASH) {
		ht = dbp->h_internal;
		ht->meta_pgno = dbp->meta_pgno;
		ht->revision = ++dbp->mpf->mfp->revision;
	} else {
		bt = dbp->bt_internal;
		bt->bt_meta = dbp->meta_pgno;
		bt->revision = ++dbp->mpf->mfp->revision;
	}

err:	if ((t_ret = __db_close(mdbp, dbc->txn, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}
1114