src/db/db_am.c

/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1998, 2013 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
#include "dbinc/hash.h"
#include "dbinc/heap.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"
#include "dbinc/partition.h"
#include "dbinc/qam.h"
#include "dbinc/txn.h"

static int __db_secondary_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
static int __dbc_set_priority __P((DBC *, DB_CACHE_PRIORITY));
static int __dbc_get_priority __P((DBC *, DB_CACHE_PRIORITY* ));

/*
 * __db_cursor_int --
 *	Internal routine to create a cursor.
 *
 *	Obtains a DBC for dbp, either by recycling one of the requested
 *	type from dbp->free_queue or by allocating and initializing a new
 *	one, refreshes its per-use state, selects the locker it will use,
 *	and links it onto dbp->active_queue.
 *
 *	dbp	database handle the cursor belongs to.
 *	ip	thread info; may be NULL, in which case the transaction's
 *		thread info or ENV_GET_THREAD_INFO is used instead.
 *	txn	enclosing transaction, or NULL.
 *	dbtype	access method type of the cursor being created.
 *	root	value stored into the cursor's internal root field.
 *	flags	tested here for DBC_OPD, DBC_DUPLICATE, DB_RECOVER,
 *		DB_CURSOR_BULK and DB_CURSOR_TRANSIENT.
 *	locker	locker to use when there is no transaction and locking is
 *		configured, or NULL to use the cursor's own/default locker.
 *	dbcp	on success, set to the new cursor.
 *
 *	Returns 0 on success or a non-zero error value.
 *
 * PUBLIC: int __db_cursor_int __P((DB *, DB_THREAD_INFO *,
 * PUBLIC:     DB_TXN *, DBTYPE, db_pgno_t, int, DB_LOCKER *, DBC **));
 */
int
__db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	DBTYPE dbtype;
	db_pgno_t root;
	int flags;
	DB_LOCKER *locker;
	DBC **dbcp;
{
	DBC *dbc;
	DBC_INTERNAL *cp;
	DB_LOCKREQ req;
	ENV *env;
	db_threadid_t tid;
	int allocated, envlid, ret;
	pid_t pid;

	env = dbp->env;
	/*
	 * allocated: the DBC was created in this call (not recycled).
	 * envlid: the environment-wide locker was created in this call.
	 */
	allocated = envlid = 0;

	/*
	 * If dbcp is non-NULL it is assumed to point to an area to initialize
	 * as a cursor.
	 *
	 * Take one from the free list if it's available.  Take only the
	 * right type.  With off page dups we may have different kinds
	 * of cursors on the queue for a single database.
	 */
	MUTEX_LOCK(env, dbp->mutex);

#ifndef HAVE_NO_DB_REFCOUNT
	/*
	 * If this DBP is being logged then refcount the log filename
	 * relative to this transaction. We do this here because we have
	 * the dbp->mutex which protects the refcount.  We want to avoid
	 * calling the function if the transaction handle has a shared parent
	 * locker or we are duplicating a cursor.  This includes the case of
	 * creating an off page duplicate cursor.
	 * If we knew this cursor will not be used in an update, we could avoid
	 * this, but we don't have that information.
	 */
	if (IS_REAL_TXN(txn) &&
	    !LF_ISSET(DBC_OPD | DBC_DUPLICATE) &&
	    !F_ISSET(dbp, DB_AM_RECOVER) &&
	    dbp->log_filename != NULL && !IS_REP_CLIENT(env) &&
	    (ret = __txn_record_fname(env, txn, dbp->log_filename)) != 0) {
		MUTEX_UNLOCK(env, dbp->mutex);
		return (ret);
	}

#endif

	/*
	 * Look for a recyclable cursor of the right type.  Every flag except
	 * DBC_OWN_LID is cleared on the one we take.  If nothing matches,
	 * the loop ends with dbc == NULL, which the test below relies on.
	 */
	TAILQ_FOREACH(dbc, &dbp->free_queue, links)
		if (dbtype == dbc->dbtype) {
			TAILQ_REMOVE(&dbp->free_queue, dbc, links);
			F_CLR(dbc, ~DBC_OWN_LID);
			break;
		}
	MUTEX_UNLOCK(env, dbp->mutex);

	/* Nothing to recycle: allocate and do the one-time initialization. */
	if (dbc == NULL) {
		if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
			return (ret);
		allocated = 1;
		dbc->flags = 0;

		dbc->dbp = dbp;
		dbc->dbenv = dbp->dbenv;
		dbc->env = dbp->env;

		/* Set up locking information. */
		if (LOCKING_ON(env)) {
			/*
			 * If we are not threaded, we share a locker ID among
			 * all cursors opened in the environment handle,
			 * allocating one if this is the first cursor.
			 *
			 * This relies on the fact that non-threaded DB handles
			 * always have non-threaded environment handles, since
			 * we set DB_THREAD on DB handles created with threaded
			 * environment handles.
			 */
			if (!DB_IS_THREADED(dbp)) {
				if (env->env_lref == NULL) {
					if ((ret = __lock_id(env,
					    NULL, &env->env_lref)) != 0)
						goto err;
				       envlid = 1;
				}
				dbc->lref = env->env_lref;
			}

			/*
			 * In CDB, secondary indices should share a lock file
			 * ID with the primary;  otherwise we're susceptible
			 * to deadlocks.  We also use __db_cursor_int rather
			 * than __db_cursor to create secondary update cursors
			 * in c_put and c_del; these won't acquire a new lock.
			 *
			 * !!!
			 * Since this is in the one-time cursor allocation
			 * code, we need to be sure to destroy, not just
			 * close, all cursors in the secondary when we
			 * associate.
			 */
			if (CDB_LOCKING(env) &&
			    F_ISSET(dbp, DB_AM_SECONDARY))
				memcpy(dbc->lock.fileid,
				    dbp->s_primary->fileid, DB_FILE_ID_LEN);
			else
				memcpy(dbc->lock.fileid,
				    dbp->fileid, DB_FILE_ID_LEN);

			/*
			 * Point lock_dbt at the object this cursor locks:
			 * a single global object or the file ID under CDB,
			 * the whole page-lock structure otherwise.
			 */
			if (CDB_LOCKING(env)) {
				if (F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
					/*
					 * If we are doing a single lock per
					 * environment, set up the global
					 * lock object just like we do to
					 * single thread creates.
					 */
					DB_ASSERT(env, sizeof(db_pgno_t) ==
					    sizeof(u_int32_t));
					dbc->lock_dbt.size = sizeof(u_int32_t);
					dbc->lock_dbt.data = &dbc->lock.pgno;
					dbc->lock.pgno = 0;
				} else {
					dbc->lock_dbt.size = DB_FILE_ID_LEN;
					dbc->lock_dbt.data = dbc->lock.fileid;
				}
			} else {
				dbc->lock.type = DB_PAGE_LOCK;
				dbc->lock_dbt.size = sizeof(dbc->lock);
				dbc->lock_dbt.data = &dbc->lock;
			}
		}
		/* Init the DBC internal structure. */
#ifdef HAVE_PARTITION
		if (DB_IS_PARTITIONED(dbp)) {
			if ((ret = __partc_init(dbc)) != 0)
				goto err;
		} else
#endif
		switch (dbtype) {
		case DB_BTREE:
		case DB_RECNO:
			if ((ret = __bamc_init(dbc, dbtype)) != 0)
				goto err;
			break;
		case DB_HASH:
			if ((ret = __hamc_init(dbc)) != 0)
				goto err;
			break;
		case DB_HEAP:
			if ((ret = __heapc_init(dbc)) != 0)
				goto err;
			break;
		case DB_QUEUE:
			if ((ret = __qamc_init(dbc)) != 0)
				goto err;
			break;
		case DB_UNKNOWN:
		default:
			ret = __db_unknown_type(env, "DB->cursor", dbtype);
			goto err;
		}

		/* cp is assigned again before its first use below. */
		cp = dbc->internal;
	}

	/* Refresh the DBC structure. */
	dbc->dbtype = dbtype;
	RESET_RET_MEM(dbc);
	dbc->set_priority = __dbc_set_priority;
	dbc->get_priority = __dbc_get_priority;
	dbc->priority = dbp->priority;
	dbc->txn_cursors.tqe_next = NULL;
	dbc->txn_cursors.tqe_prev = NULL;

	/*
	 * If the DB handle is not threaded, there is one locker ID for the
	 * whole environment.  There should only be one family transaction
	 * active as well.  This doesn't apply to CDS group transactions,
	 * where the cursor can simply use the transaction's locker directly.
	 *
	 * When this branch is taken the cursor's locker is added to the
	 * transaction's family and txn is cleared, so from here on the
	 * cursor is treated as non-transactional.
	 */
	if (!CDB_LOCKING(env) && txn != NULL && F_ISSET(txn, TXN_FAMILY) &&
	    (F_ISSET(dbc, DBC_OWN_LID) || dbc->lref == NULL || envlid))  {
		if (LOCKING_ON(env)) {
			if (dbc->lref == NULL) {
				if ((ret =
				    __lock_id(env, NULL, &dbc->lref)) != 0)
					goto err;
				F_SET(dbc, DBC_OWN_LID);
			}
			if ((ret = __lock_addfamilylocker(env,
			    txn->txnid, dbc->lref->id, 1)) != 0)
				goto err;
		}
		F_SET(dbc, DBC_FAMILY);
		txn = NULL;
	}

	/* A transactional cursor always locks with the transaction's locker. */
	if ((dbc->txn = txn) != NULL)
		dbc->locker = txn->locker;
	else if (LOCKING_ON(env)) {
		/*
		 * There are certain cases in which we want to create a
		 * new cursor with a particular locker ID that is known
		 * to be the same as (and thus not conflict with) an
		 * open cursor.
		 *
		 * The most obvious case is cursor duplication;  when we
		 * call DBC->dup or __dbc_idup, we want to use the original
		 * cursor's locker ID.
		 *
		 * Another case is when updating secondary indices.  Standard
		 * CDB locking would mean that we might block ourself:  we need
		 * to open an update cursor in the secondary while an update
		 * cursor in the primary is open, and when the secondary and
		 * primary are subdatabases or we're using env-wide locking,
		 * this is disastrous.
		 *
		 * In these cases, our caller will pass a non-NULL locker
		 * into this function.  Use this locker instead of
		 * the default as the locker for our new cursor.
		 */
		if (locker != NULL)
			dbc->locker = locker;
		else if (LF_ISSET(DB_RECOVER))
			dbc->locker = NULL;
		else {
			if (dbc->lref == NULL) {
				if ((ret =
				    __lock_id(env, NULL, &dbc->lref)) != 0)
					goto err;
				F_SET(dbc, DBC_OWN_LID);
			}
			/*
			 * If we are threaded then we need to set the
			 * proper thread id into the locker.
			 */
			if (DB_IS_THREADED(dbp)) {
				env->dbenv->thread_id(env->dbenv, &pid, &tid);
				__lock_set_thread_id(dbc->lref, pid, tid);
			}
			dbc->locker = dbc->lref;
		}
	}

	/*
	 * These fields change when we are used as a secondary index, so
	 * if the DB is a secondary, make sure they're set properly just
	 * in case we opened some cursors before we were associated.
	 *
	 * __dbc_get is used by all access methods, so this should be safe.
	 */
	if (F_ISSET(dbp, DB_AM_SECONDARY))
		dbc->get = dbc->c_get = __dbc_secondary_get_pp;

	/*
	 * Don't enable bulk for btrees with record numbering, since avoiding
	 * a full search avoids taking write locks necessary to maintain
	 * consistent numbering.
	 */
	if (LF_ISSET(DB_CURSOR_BULK) && dbtype == DB_BTREE &&
	    !F_ISSET(dbp, DB_AM_RECNUM))
		F_SET(dbc, DBC_BULK);
	if (LF_ISSET(DB_CURSOR_TRANSIENT))
		F_SET(dbc, DBC_TRANSIENT);
	if (LF_ISSET(DBC_OPD))
		F_SET(dbc, DBC_OPD);
	if (F_ISSET(dbp, DB_AM_RECOVER) || LF_ISSET(DB_RECOVER))
		F_SET(dbc, DBC_RECOVER);
	if (F_ISSET(dbp, DB_AM_COMPENSATE))
		F_SET(dbc, DBC_DONTLOCK);
	/*
	 * If this database is exclusive then the cursor
	 * does not need to get locks.
	 */
	if (F2_ISSET(dbp, DB2_AM_EXCL)) {
		F_SET(dbc, DBC_DONTLOCK);
		if (IS_REAL_TXN(txn)&& !LF_ISSET(DBC_OPD | DBC_DUPLICATE)) {
			/*
			 * Exclusive databases can only have one active
			 * transaction at a time since there are no internal
			 * locks to prevent one transaction from reading and
			 * writing another's uncommitted changes.
			 */
			if (dbp->cur_txn != NULL && dbp->cur_txn != txn) {
			    __db_errx(env, DB_STR("0749",
"Exclusive database handles can only have one active transaction at a time."));
				ret = EINVAL;
				goto err;
			}
			/* Do not trade a second time. */
			if (dbp->cur_txn != txn) {
				/* Trade the handle lock to the txn locker. */
				memset(&req, 0, sizeof(req));
				req.lock = dbp->handle_lock;
				req.op = DB_LOCK_TRADE;
				if ((ret = __lock_vec(env, txn->locker, 0,
				    &req, 1, 0)) != 0)
					goto err;
				dbp->cur_txn = txn;
				dbp->cur_locker = txn->locker;
				if ((ret = __txn_lockevent(env, txn, dbp,
				    &dbp->handle_lock, dbp->locker)) != 0)
					goto err;
			}
		}
	}
#ifdef HAVE_REPLICATION
	/*
	 * If we are replicating from a down rev version then we must
	 * use old locking protocols.
	 */
	if (LOGGING_ON(env) &&
	     ((LOG *)env->lg_handle->
	     reginfo.primary)->persist.version < DB_LOGVERSION_LATCHING)
		F_SET(dbc, DBC_DOWNREV);
#endif

	/* Refresh the DBC internal structure. */
	cp = dbc->internal;
	cp->opd = NULL;
	cp->pdbc = NULL;

	cp->indx = 0;
	cp->page = NULL;
	cp->pgno = PGNO_INVALID;
	cp->root = root;
	cp->stream_start_pgno = cp->stream_curr_pgno = PGNO_INVALID;
	cp->stream_off = 0;

	/* Access-method specific refresh; hash and queue need none here. */
	if (DB_IS_PARTITIONED(dbp)) {
		DBC_PART_REFRESH(dbc);
	} else switch (dbtype) {
	case DB_BTREE:
	case DB_RECNO:
		if ((ret = __bamc_refresh(dbc)) != 0)
			goto err;
		break;
	case DB_HEAP:
		if ((ret = __heapc_refresh(dbc)) != 0)
			goto err;
		break;
	case DB_HASH:
	case DB_QUEUE:
		break;
	case DB_UNKNOWN:
	default:
		ret = __db_unknown_type(env, "DB->cursor", dbp->type);
		goto err;
	}

	/*
	 * The transaction keeps track of how many cursors were opened within
	 * it to catch application errors where the cursor isn't closed when
	 * the transaction is resolved.
	 */
	if (txn != NULL)
		++txn->cursors;
	/* Pick the thread info: caller's, else the txn's, else look it up. */
	if (ip != NULL) {
		dbc->thread_info = ip;
#ifdef DIAGNOSTIC
		if (dbc->locker != NULL)
			ip->dbth_locker =
			    R_OFFSET(&(env->lk_handle->reginfo), dbc->locker);
		else
			ip->dbth_locker = INVALID_ROFF;
#endif
	} else if (txn != NULL)
		dbc->thread_info = txn->thread_info;
	else
		ENV_GET_THREAD_INFO(env, dbc->thread_info);

	/* Publish the cursor on the handle's active list. */
	MUTEX_LOCK(env, dbp->mutex);
	TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links);
	F_SET(dbc, DBC_ACTIVE);
	MUTEX_UNLOCK(env, dbp->mutex);

	*dbcp = dbc;
	return (0);

	/*
	 * NOTE(review): only a cursor allocated by this call is released
	 * here.  A cursor that was taken off the free queue is neither freed
	 * nor put back on any queue on this path, and only the DBC itself is
	 * passed to __os_free -- whether dbc->internal or a locker obtained
	 * above needs separate cleanup cannot be told from this function;
	 * confirm against the access-method init/destroy routines.
	 */
err:	if (allocated)
		__os_free(env, dbc);
	return (ret);
}

/*
 * __db_put --
 *	Store a key/data pair.
 *
 *	Opens a write cursor on dbp, performs the store through it, and
 *	closes the cursor again.  Four shapes of request are handled:
 *
 *	  - flags == DB_APPEND on a database that is not a primary: the
 *	    access method's append routine is called directly.
 *	  - DB_MULTIPLE: key and data are parallel bulk buffers; one
 *	    __dbc_put is issued per key/data element.
 *	  - DB_MULTIPLE_KEY: key is a single bulk buffer holding key/data
 *	    pairs; one __dbc_put is issued per pair.
 *	  - anything else: a single __dbc_put of key/data.
 *
 *	For both bulk forms key->doff is reset to zero and then incremented
 *	once for every pair stored successfully, so on return it holds the
 *	number of pairs written by this call.
 *
 *	Returns 0 on success or a non-zero error value; a cursor close
 *	failure is reported only if the put itself succeeded.
 *
 * PUBLIC: int __db_put __P((DB *,
 * PUBLIC:      DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
 */
int
__db_put(dbp, ip, txn, key, data, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	DBT *key, *data;
	u_int32_t flags;
{
	DB_HEAP_RID rid;
	DBC *dbc;
	DBT tdata, tkey;
	ENV *env;
	void *bulk_kptr, *bulk_ptr;
	db_recno_t recno;
	u_int32_t cursor_flags;
	int ret, t_ret;

	env = dbp->env;

	/*
	 * See the comment in __db_get() regarding DB_CURSOR_TRANSIENT.
	 *
	 * Note that the get in the DB_NOOVERWRITE case is safe to do with this
	 * flag set;  if it errors in any way other than DB_NOTFOUND, we're
	 * going to close the cursor without doing anything else, and if it
	 * returns DB_NOTFOUND then it's safe to do a c_put(DB_KEYLAST) even if
	 * an access method moved the cursor, since that's not
	 * position-dependent.
	 */
	cursor_flags = DB_WRITELOCK;
	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
		cursor_flags |= DB_CURSOR_BULK;
	else
		cursor_flags |= DB_CURSOR_TRANSIENT;
	if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
		return (ret);

	DEBUG_LWRITE(dbc, txn, "DB->put", key, data, flags);
	PERFMON6(env, db, put, dbp->fname,
	    dbp->dname, txn == NULL ? 0 : txn->txnid, key, data, flags);

	SET_RET_MEM(dbc, dbp);

	if (flags == DB_APPEND && !DB_IS_PRIMARY(dbp)) {
		/*
		 * If there is an append callback, the value stored in
		 * data->data may be replaced and then freed.  To avoid
		 * passing a freed pointer back to the user, just operate
		 * on a copy of the data DBT.
		 */
		tdata = *data;

		/*
		 * Append isn't a normal put operation;  call the appropriate
		 * access method's append function.
		 */
		switch (dbp->type) {
		case DB_HEAP:
			if ((ret = __heap_append(dbc, key, &tdata)) != 0)
				goto err;
			break;
		case DB_QUEUE:
			if ((ret = __qam_append(dbc, key, &tdata)) != 0)
				goto err;
			break;
		case DB_RECNO:
			if ((ret = __ram_append(dbc, key, &tdata)) != 0)
				goto err;
			break;
		case DB_BTREE:
		case DB_HASH:
		case DB_UNKNOWN:
		default:
			/* The interface should prevent this. */
			DB_ASSERT(env,
			    dbp->type == DB_QUEUE || dbp->type == DB_RECNO);

			ret = __db_ferr(env, "DB->put", 0);
			goto err;
		}

		/*
		 * The append callback, if one exists, may have allocated
		 * a new tdata.data buffer.  If so, free it.
		 */
		FREE_IF_NEEDED(env, &tdata);

		/* No need for a cursor put;  we're done. */
#ifdef HAVE_COMPRESSION
	} else if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
	    !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
		/* Let the cursor put see the original (possibly bulk) flags. */
		ret = __dbc_put(dbc, key, data, flags);
#endif
	} else if (LF_ISSET(DB_MULTIPLE)) {
		/* Parallel bulk buffers: the Nth key goes with the Nth data. */
		ret = 0;
		memset(&tkey, 0, sizeof(tkey));
		if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
			tkey.data = &recno;
			tkey.size = sizeof(recno);
		}
		memset(&tdata, 0, sizeof(tdata));
		DB_MULTIPLE_INIT(bulk_kptr, key);
		DB_MULTIPLE_INIT(bulk_ptr, data);
		/* key->doff returns the number of pairs stored. */
		key->doff = 0;
		while (ret == 0) {
			/*
			 * For record-number keys only recno is wanted from
			 * the key buffer; whatever lands in tdata here is
			 * overwritten from the data buffer just below.
			 */
			if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
				DB_MULTIPLE_RECNO_NEXT(bulk_kptr, key,
				    recno, tdata.data, tdata.size);
			else
				DB_MULTIPLE_NEXT(bulk_kptr, key,
				    tkey.data, tkey.size);
			DB_MULTIPLE_NEXT(bulk_ptr, data,
			    tdata.data, tdata.size);
			/* Stop as soon as either buffer is exhausted. */
			if (bulk_kptr == NULL || bulk_ptr == NULL)
				break;
			if (dbp->type == DB_HEAP) {
				/* Hand the put a local copy of the RID. */
				memcpy(&rid, tkey.data, sizeof(DB_HEAP_RID));
				tkey.data = &rid;
			}
			ret = __dbc_put(dbc, &tkey, &tdata,
			    LF_ISSET(DB_OPFLAGS_MASK));
			if (ret == 0)
				++key->doff;
		}
	} else if (LF_ISSET(DB_MULTIPLE_KEY)) {
		/* One bulk buffer of interleaved key/data pairs. */
		ret = 0;
		memset(&tkey, 0, sizeof(tkey));
		if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
			tkey.data = &recno;
			tkey.size = sizeof(recno);
		}
		memset(&tdata, 0, sizeof(tdata));
		DB_MULTIPLE_INIT(bulk_ptr, key);
		/*
		 * key->doff returns the number of pairs stored.  Start the
		 * count from zero, exactly as the DB_MULTIPLE case does, so
		 * the result doesn't include whatever the caller left there.
		 */
		key->doff = 0;
		while (ret == 0) {
			if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
				DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key, recno,
				    tdata.data, tdata.size);
			else
				DB_MULTIPLE_KEY_NEXT(bulk_ptr, key, tkey.data,
				    tkey.size, tdata.data, tdata.size);
			if (bulk_ptr == NULL)
				break;
			if (dbp->type == DB_HEAP) {
				/* Hand the put a local copy of the RID. */
				memcpy(&rid, tkey.data, sizeof(DB_HEAP_RID));
				tkey.data = &rid;
			}
			ret = __dbc_put(dbc, &tkey, &tdata,
			    LF_ISSET(DB_OPFLAGS_MASK));
			if (ret == 0)
				++key->doff;
		}
	} else
		ret = __dbc_put(dbc, key, data, flags);

err:	/* Close the cursor. */
	if (!DB_RETOK_DBPUT(ret))
		F_SET(dbc, DBC_ERROR);
	/* Keep the first error: a close failure only matters on success. */
	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __db_del --
 *	Delete the items referenced by a key.
 *
 *	Opens a write cursor on dbp and deletes every record matching key
 *	(all duplicates included).  With DB_MULTIPLE, key is a bulk buffer
 *	of keys; with DB_MULTIPLE_KEY, it is a bulk buffer of key/data pairs
 *	and only the exact pairs named are deleted.  In both bulk forms
 *	key->doff is zeroed and then incremented once per element processed
 *	successfully.
 *
 *	Returns 0 on success or a non-zero error value; a cursor close
 *	failure is reported only if the delete itself succeeded.
 *
 * PUBLIC: int __db_del __P((DB *,
 * PUBLIC:      DB_THREAD_INFO *, DB_TXN *, DBT *, u_int32_t));
 */
int
__db_del(dbp, ip, txn, key, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	DBT *key;
	u_int32_t flags;
{
	DB_HEAP_RID rid;
	DBC *dbc;
	DBT data, tkey;
	void *bulk_ptr;
	db_recno_t recno;
	u_int32_t cursor_flags, f_init, f_next;
	int ret, t_ret;

	COMPQUIET(bulk_ptr, NULL);
	/* Allocate a cursor. */
	cursor_flags = DB_WRITELOCK;
	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
		cursor_flags |= DB_CURSOR_BULK;
	if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
		return (ret);

	/*
	 * NOTE(review): no local named env is declared in this function, so
	 * PERFMON5 presumably discards its first argument -- confirm before
	 * changing the macro.
	 */
	DEBUG_LWRITE(dbc, txn, "DB->del", key, NULL, flags);
	PERFMON5(env, db, del,
	    dbp->fname, dbp->dname, txn == NULL ? 0 : txn->txnid, key, flags);

#ifdef HAVE_COMPRESSION
	/*
	 * A compressed database with no secondary/foreign relationships is
	 * handled entirely by the bulk delete routine.
	 */
	if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
	    !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
		F_SET(dbc, DBC_TRANSIENT);
		ret = __dbc_bulk_del(dbc, key, flags);
		goto err;
	}
#endif

	/*
	 * Walk a cursor through the key/data pairs, deleting as we go.  Set
	 * the DB_DBT_USERMEM flag, as this might be a threaded application
	 * and the flags checking will catch us.  We don't actually want the
	 * keys or data returned, so DB_DBT_ISSET is set on both DBTs before
	 * each get below.  We rely on __dbc_get to clear this.
	 */
	memset(&data, 0, sizeof(data));
	F_SET(&data, DB_DBT_USERMEM);
	tkey = *key;

	/*
	 * f_init positions the cursor on the first match (an exact key/data
	 * match for DB_MULTIPLE_KEY); f_next steps through its duplicates.
	 */
	f_init = LF_ISSET(DB_MULTIPLE_KEY) ? DB_GET_BOTH : DB_SET;
	f_next = DB_NEXT_DUP;

	/*
	 * If locking (and we haven't already acquired CDB locks), set the
	 * read-modify-write flag.
	 */
	if (STD_LOCKING(dbc)) {
		f_init |= DB_RMW;
		f_next |= DB_RMW;
	}

	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
		if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
			memset(&tkey, 0, sizeof(tkey));
			tkey.data = &recno;
			tkey.size = sizeof(recno);
		}
		DB_MULTIPLE_INIT(bulk_ptr, key);
		/* We return the number of keys deleted in doff. */
		key->doff = 0;
		/*
		 * The bulk loop re-enters here from the "next" label below,
		 * once per element of the bulk buffer.  Running off the end
		 * of the buffer (bulk_ptr == NULL) is the normal exit.
		 */
bulk_next:	if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
			DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key,
			    recno, data.data, data.size);
		else if (LF_ISSET(DB_MULTIPLE))
			DB_MULTIPLE_NEXT(bulk_ptr, key, tkey.data, tkey.size);
		else
			DB_MULTIPLE_KEY_NEXT(bulk_ptr, key,
			    tkey.data, tkey.size, data.data, data.size);
		if (bulk_ptr == NULL)
			goto err;
		if (dbp->type == DB_HEAP) {
			/* Use a local copy of the RID as the key. */
			memcpy(&rid, tkey.data, sizeof(DB_HEAP_RID));
			tkey.data = &rid;
		}

	}

	/* We're not interested in the data -- do not return it. */
	F_SET(&tkey, DB_DBT_ISSET);
	F_SET(&data, DB_DBT_ISSET);

	/*
	 * Optimize the simple cases.  For all AMs if we don't have secondaries
	 * and are not a secondary and we aren't a foreign database and there
	 * are no dups then we can avoid a bunch of overhead.  For queue we
	 * don't need to fetch the record since we delete by direct calculation
	 * from the record number.
	 *
	 * Hash permits an optimization in DB->del: since on-page duplicates are
	 * stored in a single HKEYDATA structure, it's possible to delete an
	 * entire set of them at once, and as the HKEYDATA has to be rebuilt
	 * and re-put each time it changes, this is much faster than deleting
	 * the duplicates one by one.  Thus, if not pointing at an off-page
	 * duplicate set, and we're not using secondary indices (in which case
	 * we'd have to examine the items one by one anyway), let hash do this
	 * "quick delete".
	 *
	 * !!!
	 * Note that this is the only application-executed delete call in
	 * Berkeley DB that does not go through the __dbc_del function.
	 * If anything other than the delete itself (like a secondary index
	 * update) has to happen there in a particular situation, the
	 * conditions here should be modified not to use these optimizations.
	 * The ordinary AM-independent alternative will work just fine;
	 * it'll just be slower.
	 */
	if (!F_ISSET(dbp, DB_AM_SECONDARY) && !DB_IS_PRIMARY(dbp) &&
	    LIST_FIRST(&dbp->f_primaries) == NULL) {
#ifdef HAVE_QUEUE
		if (dbp->type == DB_QUEUE) {
			ret = __qam_delete(dbc, &tkey, flags);
			goto next;
		}
#endif

		/* Fetch the first record. */
		if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
			goto err;

#ifdef HAVE_HASH
		/*
		 * Hash "quick delete" removes all on-page duplicates.  We
		 * can't do that if deleting specific key/data pairs.
		 */
		if (dbp->type == DB_HASH && !LF_ISSET(DB_MULTIPLE_KEY)) {
			DBC *sdbc;
			sdbc = dbc;
#ifdef HAVE_PARTITION
			/* Operate on the partition's underlying cursor. */
			if (F_ISSET(dbc, DBC_PARTITIONED))
				sdbc =
				    ((PART_CURSOR*)dbc->internal)->sub_cursor;
#endif
			if (sdbc->internal->opd == NULL) {
				ret = __ham_quick_delete(sdbc);
				goto next;
			}
		}
#endif

		/* No duplicates configured: one direct AM delete suffices. */
		if (!F_ISSET(dbp, DB_AM_DUP)) {
			ret = dbc->am_del(dbc, 0);
			goto next;
		}
	} else if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
		goto err;

	/* Walk through the set of key/data pairs, deleting as we go. */
	for (;;) {
		if ((ret = __dbc_del(dbc, flags)) != 0)
			break;
		/*
		 * With DB_MULTIPLE_KEY, the application has specified the
		 * exact records they want deleted.  We don't need to walk
		 * through a set of duplicates.
		 */
		if (LF_ISSET(DB_MULTIPLE_KEY))
			break;

		F_SET(&tkey, DB_DBT_ISSET);
		F_SET(&data, DB_DBT_ISSET);
		if ((ret = __dbc_get(dbc, &tkey, &data, f_next)) != 0) {
			/* Running out of duplicates is the normal end. */
			if (ret == DB_NOTFOUND)
				ret = 0;
			break;
		}
	}

	/* In bulk mode, count this element and go fetch the next one. */
next:	if (ret == 0 && LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
		++key->doff;
		goto bulk_next;
	}
err:	/* Discard the cursor. */
	if (!DB_RETOK_DBDEL(ret))
		F_SET(dbc, DBC_ERROR);
	/* Keep the first error: a close failure only matters on success. */
	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __db_sync --
 *	Flush the database cache.
 *
 *	Read-only handles have nothing to flush.  A Recno database first
 *	writes its backing source text file; a handle that was never backed
 *	by a database file stops there.  Otherwise the flush is delegated to
 *	the partition layer, the queue access method, or the memory pool.
 *
 *	Returns 0 on success or a non-zero error value.
 *
 * PUBLIC: int __db_sync __P((DB *));
 */
int
__db_sync(dbp)
	DB *dbp;
{
	int ret, t_ret;

	/* Nothing to write for a read-only database. */
	if (F_ISSET(dbp, DB_AM_RDONLY))
		return (0);

	/* A Recno tree also has a backing source text file to write. */
	ret = dbp->type == DB_RECNO ? __ram_writeback(dbp) : 0;

	/* No database file behind this handle: the write-back was all. */
	if (F_ISSET(dbp, DB_AM_INMEM))
		return (ret);

#ifdef HAVE_PARTITION
	if (DB_IS_PARTITIONED(dbp))
		return (__partition_sync(dbp));
#endif
	if (dbp->type == DB_QUEUE)
		return (__qam_sync(dbp));

	/*
	 * Flush any dirty pages from the cache to the backing file, keeping
	 * an earlier write-back error in preference to a flush error.
	 */
	t_ret = __memp_fsync(dbp->mpf);
	if (t_ret != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __db_associate --
 *	Associate another database as a secondary index to this one.
 *
 *	dbp is the primary and sdbp the secondary.  The secondary's get and
 *	close methods are replaced by secondary-aware wrappers (the originals
 *	are saved in stored_get/stored_close), the secondary is linked onto
 *	the primary's s_secondaries list, and -- when DB_CREATE was given
 *	and the secondary turned out to be empty -- the secondary is
 *	populated by walking the primary and calling callback on each
 *	record.
 *
 *	Returns 0 on success or a non-zero error value.
 *
 * PUBLIC: int __db_associate __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB *,
 * PUBLIC:     int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
 */
int
__db_associate(dbp, ip, txn, sdbp, callback, flags)
	DB *dbp, *sdbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
	u_int32_t flags;
{
	DBC *pdbc, *sdbc;
	DBT key, data, skey, *tskeyp;
	ENV *env;
	int build, ret, t_ret;
	u_int32_t nskey;

	env = dbp->env;
	pdbc = sdbc = NULL;
	ret = 0;

	/*
	 * skey/nskey/tskeyp are initialized up front because the common
	 * error exit frees whatever secondary keys are still outstanding.
	 */
	memset(&skey, 0, sizeof(DBT));
	nskey = 0;
	tskeyp = NULL;

	/*
	 * Check to see if the secondary is empty -- and thus if we should
	 * build it -- before we link it in and risk making it show up in other
	 * threads.  Do this first so that the databases remain unassociated on
	 * error.
	 */
	build = 0;
	if (LF_ISSET(DB_CREATE)) {
		FLD_SET(sdbp->s_assoc_flags, DB_ASSOC_CREATE);

		if ((ret = __db_cursor(sdbp, ip, txn, &sdbc, 0)) != 0)
			goto err;

		/*
		 * We don't care about key or data;  we're just doing
		 * an existence check.
		 */
		memset(&key, 0, sizeof(DBT));
		memset(&data, 0, sizeof(DBT));
		F_SET(&key, DB_DBT_PARTIAL | DB_DBT_USERMEM);
		F_SET(&data, DB_DBT_PARTIAL | DB_DBT_USERMEM);
		if ((ret = __dbc_get(sdbc, &key, &data,
		    (STD_LOCKING(sdbc) ? DB_RMW : 0) |
		    DB_FIRST)) == DB_NOTFOUND) {
			/* Empty secondary: it needs to be built. */
			build = 1;
			ret = 0;
		}

		if (ret != 0)
			F_SET(sdbc, DBC_ERROR);
		if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
			ret = t_ret;

		/* Reset for later error check. */
		sdbc = NULL;

		if (ret != 0)
			goto err;
	}

	/*
	 * Set up the database handle as a secondary.
	 */
	sdbp->s_callback = callback;
	sdbp->s_primary = dbp;

	/* Save the original methods so the wrappers can reach them. */
	sdbp->stored_get = sdbp->get;
	sdbp->get = __db_secondary_get;

	sdbp->stored_close = sdbp->close;
	sdbp->close = __db_secondary_close_pp;

	F_SET(sdbp, DB_AM_SECONDARY);

	if (LF_ISSET(DB_IMMUTABLE_KEY))
		FLD_SET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY);

	/*
	 * Add the secondary to the list on the primary.  Do it here
	 * so that we see any updates that occur while we're walking
	 * the primary.
	 */
	MUTEX_LOCK(env, dbp->mutex);

	/* See __db_s_next for an explanation of secondary refcounting. */
	DB_ASSERT(env, sdbp->s_refcnt == 0);
	sdbp->s_refcnt = 1;
	LIST_INSERT_HEAD(&dbp->s_secondaries, sdbp, s_links);
	MUTEX_UNLOCK(env, dbp->mutex);

	if (build) {
		/*
		 * We loop through the primary, putting each item we
		 * find into the new secondary.
		 *
		 * If we're using CDB, opening these two cursors puts us
		 * in a bit of a locking tangle:  CDB locks are done on the
		 * primary, so that we stay deadlock-free, but that means
		 * that updating the secondary while we have a read cursor
		 * open on the primary will self-block.  To get around this,
		 * we force the primary cursor to use the same locker ID
		 * as the secondary, so they won't conflict.  This should
		 * be harmless even if we're not using CDB.
		 */
		if ((ret = __db_cursor(sdbp, ip, txn, &sdbc,
		    CDB_LOCKING(sdbp->env) ? DB_WRITECURSOR : 0)) != 0)
			goto err;
		if ((ret = __db_cursor_int(dbp, ip,
		    txn, dbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
			goto err;

		/* Lock out other threads, now that we have a locker. */
		dbp->associate_locker = sdbc->locker;

		memset(&key, 0, sizeof(DBT));
		memset(&data, 0, sizeof(DBT));
		while ((ret = __dbc_get(pdbc, &key, &data, DB_NEXT)) == 0) {
			if ((ret = callback(sdbp, &key, &data, &skey)) != 0) {
				/* The callback may decline to index a record. */
				if (ret == DB_DONOTINDEX)
					continue;
				goto err;
			}
			/*
			 * The callback returns either one secondary key in
			 * skey itself, or (DB_DBT_MULTIPLE) an array of
			 * skey.size DBTs in skey.data.
			 */
			if (F_ISSET(&skey, DB_DBT_MULTIPLE)) {
#ifdef DIAGNOSTIC
				__db_check_skeyset(sdbp, &skey);
#endif
				nskey = skey.size;
				tskeyp = (DBT *)skey.data;
			} else {
				nskey = 1;
				tskeyp = &skey;
			}
			SWAP_IF_NEEDED(sdbp, &key);
			/*
			 * The secondary maps each secondary key to the
			 * primary key.  nskey/tskeyp advance as keys are
			 * consumed so that a failure leaves exactly the
			 * unprocessed keys for the error exit to free.
			 */
			for (; nskey > 0; nskey--, tskeyp++) {
				if ((ret = __dbc_put(sdbc,
				    tskeyp, &key, DB_UPDATE_SECONDARY)) != 0)
					goto err;
				FREE_IF_NEEDED(env, tskeyp);
			}
			SWAP_IF_NEEDED(sdbp, &key);
			FREE_IF_NEEDED(env, &skey);
		}
		/* Reaching the end of the primary is the normal exit. */
		if (ret == DB_NOTFOUND)
			ret = 0;
	}

	/* Common exit for success and failure; the first error wins. */
err:	if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
		ret = t_ret;

	if (pdbc != NULL && (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
		ret = t_ret;

	dbp->associate_locker = NULL;

	/* Release any secondary keys that were not consumed above. */
	for (; nskey > 0; nskey--, tskeyp++)
		FREE_IF_NEEDED(env, tskeyp);
	FREE_IF_NEEDED(env, &skey);

	return (ret);
}

/*
 * __db_secondary_get --
 *	DB->get() as installed on a handle that has been associated as a
 *	secondary index: forwards to DB->pget() with no primary-key DBT.
 */
static int
__db_secondary_get(sdbp, txn, skey, data, flags)
	DB *sdbp;
	DB_TXN *txn;
	DBT *skey, *data;
	u_int32_t flags;
{
	int ret;

	/* This wrapper is only ever installed on secondaries. */
	DB_ASSERT(sdbp->env, F_ISSET(sdbp, DB_AM_SECONDARY));

	ret = __db_pget_pp(sdbp, txn, skey, NULL, data, flags);
	return (ret);
}

/*
 * __db_secondary_close --
 *	Wrapper function for DB->close() which we use on secondaries to
 *	manage refcounting and make sure we don't close them underneath
 *	a primary that is updating.
 *
 *	Returns the result of __db_close when the handle is really closed,
 *	or 0 when the close is left to whoever drops the last reference.
 *
 * PUBLIC: int __db_secondary_close __P((DB *, u_int32_t));
 */
int
__db_secondary_close(sdbp, flags)
	DB *sdbp;
	u_int32_t flags;
{
	DB *primary;
	ENV *env;
	int last_ref;

	/*
	 * If the opening transaction is rolled back then the db handle
	 * will have already been refreshed, we just need to call
	 * __db_close to free the data.
	 */
	if (!F_ISSET(sdbp, DB_AM_OPEN_CALLED))
		return (__db_close(sdbp, NULL, flags));

	primary = sdbp->s_primary;
	env = primary->env;
	last_ref = 0;

	/*
	 * Drop our reference under the primary's mutex.  If the count was
	 * at 1 when we were called, no thread is currently updating this
	 * secondary through the primary, so it's safe to close it for real.
	 *
	 * Otherwise we do nothing;  the database will actually be closed
	 * when the refcount is decremented, which can happen in either
	 * __db_s_next or __db_s_done.
	 */
	MUTEX_LOCK(env, primary->mutex);
	DB_ASSERT(env, sdbp->s_refcnt != 0);
	if (--sdbp->s_refcnt == 0) {
		LIST_REMOVE(sdbp, s_links);
		last_ref = 1;
	}
	MUTEX_UNLOCK(env, primary->mutex);

	if (!last_ref)
		return (0);

	/*
	 * The real close is done only now, after the mutex was released.
	 * sdbp->close is this function, so call the real one explicitly.
	 */
	return (__db_close(sdbp, NULL, flags));
}

/*
 * __db_associate_foreign --
 *	Associate this database (fdbp) as a foreign constraint to another
 *	database (pdbp).  That is, fdbp's keys appear as foreign key values
 *	in pdbp.
 *
 *	fdbp		the foreign database.
 *	pdbp		the database whose values are constrained by fdbp.
 *	callback	stored in the new DB_FOREIGN_INFO entry.
 *	flags		stored in the new entry's flags; they select the
 *			delete action type.
 *
 *	Returns 0 on success, EINVAL if pdbp already has a foreign database,
 *	or the error from the memory allocation.  On any error return
 *	neither database has been modified.
 *
 * PUBLIC: int __db_associate_foreign __P((DB *, DB *,
 * PUBLIC:     int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
 * PUBLIC:     u_int32_t));
 */
int
__db_associate_foreign(fdbp, pdbp, callback, flags)
	DB *fdbp, *pdbp;
	int (*callback)(DB *, const DBT *, DBT *, const DBT *, int *);
	u_int32_t flags;
{
	DB_FOREIGN_INFO *f_info;
	ENV *env;
	int ret;

	env = fdbp->env;

	/*
	 * We don't allow the foreign db to be changed, because we currently
	 * have no way of removing pdbp from the old foreign db's list of
	 * primaries.  Check this before touching fdbp's list: for the very
	 * same reason, an entry linked in below could not be taken back out
	 * if the association were rejected afterwards.
	 */
	if (pdbp->s_foreign != NULL)
		return (EINVAL);

	if ((ret = __os_malloc(env, sizeof(DB_FOREIGN_INFO), &f_info)) != 0)
		return (ret);
	memset(f_info, 0, sizeof(DB_FOREIGN_INFO));

	f_info->dbp = pdbp;
	f_info->callback = callback;

	/*
	 * It might be wise to filter this, but for now the flags only
	 * set the delete action type.
	 */
	FLD_SET(f_info->flags, flags);

	/*
	 * Add f_info to the foreign database's list of primaries.  That is to
	 * say, fdbp->f_primaries lists all databases for which fdbp is a
	 * foreign constraint.
	 */
	MUTEX_LOCK(env, fdbp->mutex);
	LIST_INSERT_HEAD(&fdbp->f_primaries, f_info, f_links);
	MUTEX_UNLOCK(env, fdbp->mutex);

	/*
	 * Associate fdbp as pdbp's foreign db, for referential integrity
	 * checks.
	 */
	pdbp->s_foreign = fdbp;

	return (ret);
}

/*
 * __dbc_set_priority --
 *	Cursor set_priority method (installed by __db_cursor_int): record
 *	the cache priority on the cursor.  Always returns 0.
 */
static int
__dbc_set_priority(dbc, priority)
	DBC *dbc;
	DB_CACHE_PRIORITY priority;
{
	dbc->priority = priority;
	return (0);
}

/*
 * __dbc_get_priority --
 *	Cursor get_priority method (installed by __db_cursor_int): return
 *	the cursor's cache priority through *priority.  A cursor whose
 *	priority is DB_PRIORITY_UNCHANGED reports the priority of its
 *	database's memory pool file instead.
 */
static int
__dbc_get_priority(dbc, priority)
	DBC *dbc;
	DB_CACHE_PRIORITY *priority;
{
	/* The cursor has a priority of its own: report it. */
	if (dbc->priority != DB_PRIORITY_UNCHANGED) {
		*priority = dbc->priority;
		return (0);
	}

	/* Otherwise defer to the underlying file's setting. */
	return (__memp_get_priority(dbc->dbp->mpf, priority));
}