src/hash/hash_compact.c

/*-
 * Copyright (c) 1996, 2020 Oracle and/or its affiliates.  All rights reserved.
 *
 * See the file LICENSE for license information.
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
#include "dbinc/hash.h"
#include "dbinc/lock.h"
#include "dbinc/txn.h"
#include "dbinc/mp.h"

static int __ham_copy_data __P((DBC *, PAGE *, DB_COMPACT *, int *));
static int __ham_truncate_overflow __P((DBC *, u_int32_t, DB_COMPACT *, int *));

/*
 * __ham_compact_int -- internal HASH compaction routine.
 *
 * PUBLIC: int __ham_compact_int __P((DBC *,
 * PUBLIC:      DBT *, DBT *, u_int32_t, DB_COMPACT *, int *, u_int32_t));
 */
int
__ham_compact_int(dbc, start, stop, factor, c_data, donep, flags)
	DBC *dbc;
	DBT *start, *stop;
	u_int32_t factor;
	DB_COMPACT *c_data;
	int *donep;
	u_int32_t flags;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	db_pgno_t origpgno, pgno;
	int check_trunc, pgs_done, ret, t_ret;
	u_int32_t empty_buckets, i, stop_bucket;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	hcp = (HASH_CURSOR *)dbc->internal;
	pgs_done = 0;
	empty_buckets = 0;
	check_trunc = c_data->compact_truncate != PGNO_INVALID;
	if ((ret = __ham_get_meta(dbc)) != 0)
		return (ret);

	if (stop != NULL && stop->size != 0)
		stop_bucket = *(u_int32_t *)stop->data;
	else
		stop_bucket = hcp->hdr->max_bucket;

	if (start != NULL && start->size != 0)
		hcp->bucket = *(u_int32_t *)start->data;
	else
		hcp->bucket = 0;

	for (; hcp->bucket <= stop_bucket && ret == 0; hcp->bucket++) {
		/*
		 * For each bucket first move records toward the head of
		 * the bucket.
		 */
		hcp->indx = NDX_INVALID;
		F_CLR(hcp, H_ISDUP);
		hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
		pgno = PGNO_INVALID;
		ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);

		/*
		 * If the bucket is empty, just note it, otherwise process it.
		 * If there are any records there must be some in the head
		 * of the bucket.
		 */
		if (ret == DB_NOTFOUND ) {
			empty_buckets++;
			c_data->compact_pages_examine++;
			DB_ASSERT(dbp->env,
			    PREV_PGNO(hcp->page) == PGNO_INVALID &&
			    NEXT_PGNO(hcp->page) == PGNO_INVALID);
			goto err;
		} else if (ret != 0)
			break;
		c_data->compact_pages_examine++;

		if (NEXT_PGNO(hcp->page) != PGNO_INVALID) {
			if ((ret =
			    __ham_compact_bucket(dbc, c_data, &pgs_done)) != 0)
				goto err;
			pgno = PGNO_INVALID;
			if ((ret = __ham_item(dbc, DB_LOCK_WRITE, &pgno)) != 0)
				goto err;
		}

		/*
		 * Loop through the items in this page in the bucket and process
		 * overflow records and off page duplicate sets.
		 */
		while (ret == 0) {
			/* Handle off page duplicate trees. */
			if (pgno == PGNO_INVALID)
				goto no_opd;
			if (check_trunc &&
			    pgno > c_data->compact_truncate) {
				c_data->compact_pages_examine++;
				/*
				 * Truncate this page if possible.
				 * We must update the parent here
				 * because the page number is
				 * not aligned.
				 */
				if ((ret = __memp_dirty(mpf, &hcp->page,
				    dbc->thread_info,
				    dbc->txn, dbc->priority, 0)) != 0)
					break;
				origpgno = pgno;
				if ((ret = __db_truncate_root(dbc, hcp->page,
				    H_DATAINDEX(hcp->indx),
				    &pgno, 0, &pgs_done)) != 0)
					break;
				if (pgno != origpgno) {
					memcpy(HOFFDUP_PGNO(H_PAIRDATA(dbp,
					    hcp->page, hcp->indx)),
					    &pgno, sizeof(db_pgno_t));
					pgs_done++;
					c_data->compact_pages--;
				}
			}
			/*
			 * Compact the off page duplicate tree.
			 */
			if ((ret = __bam_compact_opd(dbc,
			    pgno, NULL, factor, c_data, &pgs_done)) != 0)
				break;

no_opd:			if (check_trunc && HPAGE_PTYPE(H_PAIRDATA(
			    dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
				/* This is an overflow chain. */
				if ((ret = __ham_truncate_overflow(dbc,
				    H_DATAINDEX(hcp->indx),
				    c_data, &pgs_done)) != 0)
					break;
			}

			/* Check for an overflow key. */
			if (check_trunc && HPAGE_PTYPE(H_PAIRKEY(
			    dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
				/* This is an overflow chain. */
				if ((ret = __ham_truncate_overflow(dbc,
				    H_KEYINDEX(hcp->indx),
				    c_data, &pgs_done)) != 0)
					break;
			}

			pgno = PGNO_INVALID;
			ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
		}

err:		if (hcp->page != NULL &&
		    (t_ret = __memp_fput(mpf, dbc->thread_info,
		    hcp->page, dbc->priority)) != 0 && ret == 0)
			ret = t_ret;
		if (ret == DB_NOTFOUND)
			ret = 0;
		hcp->page = NULL;
		hcp->pgno = pgno = PGNO_INVALID;
		/*
		 * If we are in an auto-transaction and we updated something
		 * return to the caller to commit this transaction to
		 * avoid holding locks. Otherwise process the next bucket.
		 * We can drop the lock if we did not do anything.
		 * We always must commit the txn if we are in MVCC
		 * as we have dirtied the hash buckets.
		 */
		if (ret == 0 &&
		    atomic_read(&dbp->mpf->mfp->multiversion) == 0 &&
		    (pgs_done == 0 || dbc->txn == NULL))
			ret = __LPUT(dbc, hcp->lock);
		else if (LF_ISSET(DB_AUTO_COMMIT)) {
			if (ret == 0)
				hcp->bucket++;
			break;
		}
	}
	/*
	 * If we saw any empty buckets and we are freeing space we
	 * want to contract the table before dropping the metadata
	 * page. Wait till we are done with everything else as we
	 * need to get an exclusive lock on the metadata page.
	 */
	if (ret == 0 && empty_buckets != 0 && LF_ISSET(DB_FREE_SPACE)) {
		for (i = 0; i < empty_buckets && hcp->hdr->max_bucket > 2; i++)
			if ((ret = __ham_contract_table(dbc, c_data)) != 0)
				break;
	}

	if (ret == 0)
		ret = __db_retcopy(dbp->env, start, &hcp->bucket,
		    sizeof(hcp->bucket), &start->data, &start->ulen);
	(void)__ham_release_meta(dbc);
	c_data->compact_empty_buckets += empty_buckets;
	if (hcp->bucket > stop_bucket)
		*donep = 1;
	return (ret);
}

/*
 * __ham_compact_bucket -- move data to as few pages as possible.
 *
 * PUBLIC: int __ham_compact_bucket __P((DBC *, DB_COMPACT *, int *));
 */
int
__ham_compact_bucket(dbc, c_data, pgs_donep)
	DBC *dbc;
	DB_COMPACT *c_data;
	int *pgs_donep;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	PAGE *pg;
	db_pgno_t pgno;
	int check_trunc, ret, t_ret;

	hcp = (HASH_CURSOR *)dbc->internal;
	dbp = dbc->dbp;
	mpf = dbp->mpf;
	pg = hcp->page;
	check_trunc = c_data->compact_truncate != PGNO_INVALID;
	ret = 0;

	pgno = hcp->pgno;
	do {
		if (pg == NULL && (ret = __memp_fget(mpf, &pgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
			break;
		/* Sort any unsorted pages before adding to the page. */
		if (TYPE(pg) == P_HASH_UNSORTED) {
			if ((ret = __ham_sort_page_cursor(dbc, pg)) != 0)
				break;
			(*pgs_donep)++;
		}

		/* If this is not the head try to move it to a lower page. */
		if (check_trunc && PREV_PGNO(pg) != PGNO_INVALID  &&
		    PGNO(pg) > c_data->compact_truncate &&
		    (ret = __db_exchange_page(dbc, &pg,
		    hcp->page, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
			break;
		if (pgno != PGNO(pg))
			(*pgs_donep)++;

		if (NEXT_PGNO(pg) == PGNO_INVALID)
			break;
		if ((ret = __ham_copy_data(dbc, pg, c_data, pgs_donep)) != 0)
			break;
		pgno = NEXT_PGNO(pg);
		if (pg != hcp->page && (ret = __memp_fput(mpf,
		     dbc->thread_info, pg, dbc->priority)) != 0)
			break;
		pg = NULL;
	} while (pgno != PGNO_INVALID);

	if (pg != NULL && pg != hcp->page &&
	    (t_ret = __memp_fput(mpf, dbc->thread_info, pg, dbc->priority)) &&
	    ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * __ham_copy_data -- copy as many records as possible from next page
 */
static int
__ham_copy_data(dbc, pg, c_data, pgs_donep)
	DBC *dbc;
	PAGE *pg;
	DB_COMPACT *c_data;
	int *pgs_donep;
{
	DB *dbp;
	DBC *newdbc;
	DBT data, key;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp, *ncp;
	PAGE *nextpage;
	db_pgno_t origpgno;
	int i, nument, records, ret, t_ret;
	u_int32_t len;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	hcp = (HASH_CURSOR *)dbc->internal;
	records = 0;

	if ((ret = __dbc_dup(dbc, &newdbc, 0)) != 0)
		return (ret);
	ncp = (HASH_CURSOR *)newdbc->internal;
	ncp->hdr = hcp->hdr;

	/*
	 * Copy data to the front of the bucket. Loop until either we
	 * have not replaced the next page or there is no next page.
	 * If the next page was not removed then it still has data
	 * on it.
	 */
	origpgno = PGNO_INVALID;
	while (origpgno != NEXT_PGNO(pg) &&
	    (origpgno = NEXT_PGNO(pg)) != PGNO_INVALID) {

		if ((ret = __memp_fget(mpf, &NEXT_PGNO(pg), dbc->thread_info,
		    dbc->txn, DB_MPOOL_DIRTY, &nextpage)) != 0)
			break;

		c_data->compact_pages_examine++;
		ncp->page = nextpage;
		ncp->pgno = PGNO(nextpage);
		ncp->indx = 0;
		memset(&key, 0, sizeof(key));
		memset(&data, 0, sizeof(data));
		nument = NUM_ENT(nextpage);
		DB_ASSERT(dbp->env, nument != 0);
		for (i = 0; i < nument; i += 2) {
			len = LEN_HITEM(dbp, nextpage, dbp->pgsize, 0) +
			    LEN_HITEM(dbp, nextpage, dbp->pgsize, 1) +
			    2 * sizeof(db_indx_t);
			if (P_FREESPACE(dbp, pg) < len)
				continue;

			if ((ret =
			    __ham_copypair(dbc, nextpage, 0, pg, NULL, 1)) != 0)
				break;

			records++;
			if ((ret = __ham_del_pair(newdbc,
			    HAM_DEL_IGNORE_OFFPAGE, pg)) != 0)
				break;
			if (!STD_LOCKING(dbc)) {
				if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
					return (ret);
				++hcp->hdr->nelem;
			}
		}
		/*
		 * If we moved all the records then __ham_del_pair will
		 * have deleted the nextpage.
		 */
		if (records >= nument/2) {
			c_data->compact_pages_examine++;
			c_data->compact_pages_free++;
			COMPACT_TRUNCATE(c_data);
		}
		if (ncp->page != NULL &&
		    (t_ret = __memp_fput(mpf, dbc->thread_info,
		    ncp->page, dbc->priority)) != 0 && ret == 0)
			ret = t_ret;
		ncp->page = NULL;
		ncp->pgno = PGNO_INVALID;
	}

	/*
	 * If __ham_del_pair freed a page then we needed to dirty the metapage
	 * and it could change so we need to copy it back to hcp.
	 */
	hcp->hdr = ncp->hdr;
	ncp->hdr = NULL;
	if ((t_ret = __ham_release_meta(newdbc)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __dbc_close(newdbc)) != 0 && ret == 0)
		ret = t_ret;
	if (records != 0)
		(*pgs_donep)++;
	return (ret);
}

/*
 * __ham_truncate_overflow -- try to truncate pages from an overflow chain.
 */
static int
__ham_truncate_overflow(dbc, indx, c_data, pgs_done)
	DBC *dbc;
	u_int32_t indx;
	DB_COMPACT *c_data;
	int *pgs_done;
{
	DB *dbp;
	HASH_CURSOR *hcp;
	db_pgno_t origpgno, pgno;
	int ret;

	hcp = (HASH_CURSOR *)dbc->internal;
	dbp = dbc->dbp;
	memcpy(&pgno,
	    HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)), sizeof(db_pgno_t));
	if (pgno > c_data->compact_truncate) {
		c_data->compact_pages_examine++;
		origpgno = pgno;
		if ((ret = __memp_dirty(dbp->mpf, &hcp->page,
		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			return (ret);
		if ((ret = __db_truncate_root(dbc,
		    hcp->page, indx, &pgno, 0, pgs_done)) != 0)
			return (ret);
		if (pgno != origpgno) {
			memcpy(HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)),
			    &pgno, sizeof(db_pgno_t));
			(*pgs_done)++;
			c_data->compact_pages--;
		}
	}
	if ((ret =
	    __db_truncate_overflow(dbc, pgno, NULL, c_data, pgs_done)) != 0)
		return (ret);
	return (0);
}

#ifdef HAVE_FTRUNCATE
/*
 * __ham_compact_hash -- compact the hash table.
 * PUBLIC: int __ham_compact_hash __P((DB *,
 * PUBLIC:       DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
 */
int
__ham_compact_hash(dbp, ip, txn, c_data)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	DB_COMPACT *c_data;
{
	DBC *dbc;
	DB_LOCK lock;
	HASH_CURSOR *hcp;
	HMETA *meta;
	PAGE *oldpage;
	db_pgno_t free_pgno, last_pgno, pgno, start_pgno;
	int flags, local_txn, pgs_done, ret, t_ret;
	u_int32_t bucket, i, size;

	local_txn = IS_DB_AUTO_COMMIT(dbp, txn);
	pgs_done = 0;
	oldpage = NULL;
	dbc = NULL;
	LOCK_INIT(lock);

	if (local_txn &&
	    (ret = __txn_begin(dbp->env, ip, txn, &txn, 0)) != 0)
		return (ret);

	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
		goto err1;
	hcp = (HASH_CURSOR *)dbc->internal;

	if ((ret = __ham_get_meta(dbc)) != 0 ||
	    (ret = __ham_dirty_meta(dbc, 0)) != 0)
		goto err1;

	meta = hcp->hdr;

	LOCK_CHECK_OFF(ip);

	/*
	 * Find contiguous lower numbered pages for each hash table segment.
	 */
	for (i = 0; i < NCACHED && meta->spares[i] != PGNO_INVALID; i++) {
		if (i == 0) {
			bucket = 0;
			size = 1;
		} else {
			bucket = 1 << (i - 1);
			size = bucket;
		}
		start_pgno = meta->spares[i] + bucket;
		if ((ret = __db_find_free(dbc, P_HASH,
		    size, start_pgno, &free_pgno)) != 0) {
			if (ret != DB_NOTFOUND)
				break;
			ret = 0;
			continue;
		}
		if (DBC_LOGGING(dbc)) {
			if ((ret = __ham_changeslot_log(dbp,
			    dbc->txn, &LSN(meta),
			    0, &LSN(meta), i, start_pgno, free_pgno)) != 0)
				break;
		} else
			LSN_NOT_LOGGED(LSN(meta));
		last_pgno = free_pgno + bucket;
		/*
		 * March through the list swapping pages.  If the page is
		 * empty we just need to free it.  If we are just sliding
		 * things down don't free the pages that will be reused.
		 * Note that __db_exchange_page returns the new page so
		 * we must put it.
		 */
		for (pgno = start_pgno;
		    pgno < start_pgno + size; pgno++, free_pgno++) {
			if ((ret = __db_lget(dbc,
			    LCK_COUPLE, pgno, DB_LOCK_WRITE, 0, &lock)) != 0)
				goto err;
			if ((ret = __memp_fget(dbp->mpf, &pgno,
			     dbc->thread_info, dbc->txn,
			     DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &oldpage)) != 0)
				goto err;
			if (NUM_ENT(oldpage) != 0) {
				if (pgno < last_pgno)
					flags = 0;
				else
					flags = DB_EXCH_FREE;
				if ((ret = __db_exchange_page(dbc, &oldpage,
				    NULL, free_pgno, flags, &pgs_done)) != 0)
					goto err;
			} else if (pgno >= last_pgno) {
				if ((ret = __db_free(dbc, oldpage, 0)) != 0)
					goto err;
				COMPACT_TRUNCATE(c_data);
				oldpage = NULL;
			}
			if (oldpage != NULL && (ret = __memp_fput(dbp->mpf,
			    dbc->thread_info, oldpage, dbc->priority)) != 0)
				goto err;
			ret = 0;
			oldpage = NULL;
			c_data->compact_pages_examine++;
		}
		meta->spares[i] = free_pgno - (size + bucket);
	}
	if (ret == 0 && F_ISSET(dbp, DB_AM_SUBDB) &&
	    PGNO(hcp->hdr) > c_data->compact_truncate)
		ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr,
		    c_data, &pgs_done);

err:	if (oldpage != NULL && (t_ret = __memp_fput(dbp->mpf,
	    dbc->thread_info, oldpage, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
		ret = t_ret;
	LOCK_CHECK_ON(ip);
err1:	if (dbc != NULL) {
		if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
			ret = t_ret;
		if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
			ret = t_ret;
	}
	if (local_txn && (t_ret = (ret == 0 ?
	    __txn_commit(txn, 0) : __txn_abort(txn))) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
#endif