1 /*-
2  * Copyright (c) 1996, 2020 Oracle and/or its affiliates.  All rights reserved.
3  *
4  * See the file LICENSE for license information.
5  */
6 /*
7  * Copyright (c) 1995, 1996
8  *	The President and Fellows of Harvard University.  All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * Margo Seltzer.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  * $Id$
38  */
39 
40 #include "db_config.h"
41 
42 #include "db_int.h"
43 #include "dbinc/db_page.h"
44 #include "dbinc/hash.h"
45 #include "dbinc/fop.h"
46 #include "dbinc/lock.h"
47 #include "dbinc/mp.h"
48 #include "dbinc/txn.h"
49 #include "dbinc/log_verify.h"
50 
51 static int __db_txnlist_find_internal __P((ENV *, DB_TXNHEAD *,
52 		db_txnlist_type, u_int32_t,  DB_TXNLIST **,
53 		int, u_int32_t *));
54 
55 /*
56  * __db_dispatch --
57  *	Decide whether a given log record should be applied for a given
58  *	"recovery" mode.  If so, call the corresponding function in the
59  *	DB_DISTAB dispatch table.
60  *
61  * Parameters:
62  *	dtab   - a dispatch table mapping the logrec type to the function to
63  *		 call.  The normal one is built into env->recover_dtab.
64  *	logrec - a transaction log record as a byte stream, in a DBT.  This
65  *		 describes the data to dispatch.
66  *	lsnp   - logrec's LSN
67  *	recop  - what to do; usually one of recovery's steps in applying or
68  *		 undoing a txn; the others are DB_TXN_PRINT & DB_TXN_LOG_VERIFY.
69  *	params - A pointer to pass to the function.  It usually is a DB_TXNHEAD,
70  *		 but is DB_LOG when printing or DB_LOG_VERIFY for verify.
71  *
72  * PUBLIC: int __db_dispatch __P((ENV *,
73  * PUBLIC:     DB_DISTAB *, DBT *, DB_LSN *, db_recops, void *));
74  */
75 int
__db_dispatch(env,dtab,logrec,lsnp,recop,params)76 __db_dispatch(env, dtab, logrec, lsnp, recop, params)
77 	ENV *env;
78 	DB_DISTAB *dtab;
79 	DBT *logrec;		/* The log record upon which to dispatch. */
80 	DB_LSN *lsnp;		/* The lsn of the record being dispatched. */
81 	db_recops recop;	/* Redo this op (or undo it). */
82 	void *params;
83 {
84 	DB_ENV *dbenv;
85 	DB_TXNHEAD *txninfo;
86 	DB_LOG_VRFY_INFO *lvh;
87 	DB_LSN prev_lsn;
88 	u_int32_t rectype, status, txnid, urectype;
89 	int make_call, ret;
90 
91 	dbenv = env->dbenv;
92 	make_call = ret = 0;
93 	lvh = NULL;
94 	txninfo = NULL;
95 	LOGCOPY_32(env, &rectype, logrec->data);
96 	LOGCOPY_32(env, &txnid, (u_int8_t *)logrec->data + sizeof(rectype));
97 
98 	/*
99 	 * Log verification passes a DB_LOG_VRFY_INFO structure, log print
100 	 * passes in a DB_LOG, everyone else passes a DB_TXNHEAD structure.
101 	 */
102 	if (recop != DB_TXN_LOG_VERIFY)
103 		txninfo = (DB_TXNHEAD *)params;
104 	else
105 		lvh = (DB_LOG_VRFY_INFO *)params;
106 
107 	/* If we don't have a dispatch table, it's hard to dispatch. */
108 	DB_ASSERT(env, dtab != NULL);
109 
110 	/*
111 	 * If we find a record that is in the user's number space and they
112 	 * have specified a recovery routine, let them handle it.  If they
113 	 * didn't specify a recovery routine, then we expect that they've
114 	 * followed all our rules and registered new recovery functions.
115 	 */
116 	switch (recop) {
117 	case DB_TXN_ABORT:
118 	case DB_TXN_APPLY:
119 	case DB_TXN_LOG_VERIFY:
120 	case DB_TXN_PRINT:
121 		make_call = 1;
122 		break;
123 	case DB_TXN_OPENFILES:
124 		/*
125 		 * We collect all the transactions that have
126 		 * "begin" records, those with no previous LSN,
127 		 * so that we do not abort partial transactions.
128 		 * These are known to be undone, otherwise the
129 		 * log would not have been freeable.
130 		 */
131 		LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)logrec->data +
132 		    sizeof(rectype) + sizeof(txnid));
133 		if (txnid != 0 && prev_lsn.file == 0 && (ret =
134 		    __db_txnlist_add(env, txninfo, txnid, TXN_OK, NULL)) != 0)
135 			return (ret);
136 
137 		/* FALLTHROUGH */
138 	case DB_TXN_POPENFILES:
139 		if (rectype == DB___dbreg_register ||
140 		    rectype == DB___txn_child ||
141 		    rectype == DB___txn_ckp || rectype == DB___txn_recycle)
142 			return ((dtab->int_dispatch[rectype])(env,
143 			    logrec, lsnp, recop, params));
144 		break;
145 	case DB_TXN_BACKWARD_ROLL:
146 		/*
147 		 * Running full recovery in the backward pass.  In general,
148 		 * we only process records during this pass that belong
149 		 * to aborted transactions.  Unfortunately, there are several
150 		 * exceptions:
151 		 * 1. If this is a meta-record, one not associated with
152 		 *    a transaction, then we must always process it.
153 		 * 2. If this is a transaction commit/abort, we must
154 		 *    always process it, so that we know the status of
155 		 *    every transaction.
156 		 * 3. If this is a child commit, we need to process it
157 		 *    because the outcome of the child transaction depends
158 		 *    on the outcome of the parent.
159 		 * 4. If this is a dbreg_register record, we must always
160 		 *    process is because they contain non-transactional
161 		 *    closes that must be properly handled.
162 		 * 5. If this is a noop, we must always undo it so that we
163 		 *    properly handle any aborts before a file was closed.
164 		 * 6. If this a file remove, we need to process it to
165 		 *    determine if the on-disk file is the same as the
166 		 *    one being described.
167 		 */
168 		switch (rectype) {
169 		/*
170 		 * These either do not belong to a transaction or (regop)
171 		 * must be processed regardless of the status of the
172 		 * transaction.
173 		 */
174 		case DB___txn_regop:
175 		case DB___txn_recycle:
176 		case DB___txn_ckp:
177 			make_call = 1;
178 			break;
179 		/*
180 		 * These belong to a transaction whose status must be
181 		 * checked.
182 		 */
183 		case DB___txn_child:
184 		case DB___db_noop:
185 		case DB___fop_file_remove:
186 		case DB___dbreg_register:
187 			make_call = 1;
188 
189 			/* FALLTHROUGH */
190 		default:
191 			if (txnid == 0)
192 				break;
193 
194 			ret = __db_txnlist_find(env, txninfo, txnid, &status);
195 
196 			/* If not found, this is an incomplete abort.  */
197 			if (ret == DB_NOTFOUND)
198 				return (__db_txnlist_add(env,
199 				    txninfo, txnid, TXN_IGNORE, lsnp));
200 			if (ret != 0)
201 				return (ret);
202 
203 			/*
204 			 * If we ignore the transaction, ignore the operation
205 			 * UNLESS this is a child commit in which case we need
206 			 * to make sure that the child also gets marked as
207 			 * ignore.
208 			 */
209 			if (status == TXN_IGNORE && rectype != DB___txn_child) {
210 				make_call = 0;
211 				break;
212 			}
213 			if (status == TXN_COMMIT)
214 				break;
215 
216 			/* Set make_call in case we came through default */
217 			make_call = 1;
218 			if (status == TXN_OK &&
219 			    (ret = __db_txnlist_update(env,
220 			    txninfo, txnid, rectype == DB___txn_prepare ?
221 			    TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0)
222 				return (ret);
223 		}
224 		break;
225 	case DB_TXN_FORWARD_ROLL:
226 		/*
227 		 * In the forward pass, if we haven't seen the transaction,
228 		 * do nothing, else recover it.
229 		 *
230 		 * We need to always redo DB___db_noop records, so that we
231 		 * properly handle any commits after the file was closed.
232 		 */
233 		switch (rectype) {
234 		case DB___txn_recycle:
235 		case DB___txn_ckp:
236 		case DB___db_noop:
237 		case DB___dbreg_register:
238 			make_call = 1;
239 			break;
240 
241 		default:
242 			if (txnid == 0)
243 				status = 0;
244 			else {
245 				ret = __db_txnlist_find(env,
246 				    txninfo, txnid, &status);
247 
248 				if (ret == DB_NOTFOUND)
249 					/* Break out of if clause. */
250 					;
251 				else if (ret != 0)
252 					return (ret);
253 				else if (status == TXN_COMMIT) {
254 					make_call = 1;
255 					break;
256 				}
257 			}
258 
259 		}
260 		break;
261 	default:
262 		return (__db_unknown_flag(
263 		    env, "__db_dispatch", (u_int32_t)recop));
264 	}
265 
266 	if (make_call) {
267 		/*
268 		 * If the debug flag is set then we are logging
269 		 * records for a non-durable update so that they
270 		 * may be examined for diagnostic purposes.
271 		 * So only make the call if we are printing,
272 		 * otherwise we need to extract the previous
273 		 * lsn so undo will work properly.
274 		 */
275 		if (rectype & DB_debug_FLAG) {
276 			if (recop == DB_TXN_PRINT)
277 				rectype &= ~DB_debug_FLAG;
278 			else {
279 				LOGCOPY_TOLSN(env, lsnp,
280 				    (u_int8_t *)logrec->data +
281 				    sizeof(rectype) + sizeof(txnid));
282 				return (0);
283 			}
284 		}
285 		if (rectype >= DB_user_BEGIN) {
286 			/*
287 			 * Increment user log count, we can't pass any extra
288 			 * args into app_dispatch, so this has to be done here.
289 			 */
290 			if (recop == DB_TXN_LOG_VERIFY)
291 				lvh->external_logrec_cnt++;
292 			if (dbenv->app_dispatch != NULL)
293 				return (dbenv->app_dispatch(dbenv,
294 				    logrec, lsnp, recop));
295 
296 			/* No application-specific dispatch */
297 			urectype = rectype - DB_user_BEGIN;
298 			if (urectype > dtab->ext_size ||
299 			    dtab->ext_dispatch[urectype] == NULL) {
300 				__db_errx(env, DB_STR_A("0512",
301 	    "Illegal application-specific record type %lu in log",
302 				    "%lu"), (u_long)rectype);
303 				return (EINVAL);
304 			}
305 
306 			return ((dtab->ext_dispatch[urectype])(dbenv,
307 			    logrec, lsnp, recop));
308 		} else {
309 			if (rectype > dtab->int_size ||
310 			    dtab->int_dispatch[rectype] == NULL) {
311 				__db_errx(env, DB_STR_A("0513",
312 				    "Illegal record type %lu in log", "%lu"),
313 				    (u_long)rectype);
314 				if (recop == DB_TXN_LOG_VERIFY)
315 					lvh->unknown_logrec_cnt++;
316 
317 				return (EINVAL);
318 			}
319 
320 			return ((dtab->int_dispatch[rectype])(env,
321 			    logrec, lsnp, recop, params));
322 		}
323 	}
324 
325 	return (0);
326 }
327 
328 /*
329  * __db_add_recovery -- Add recovery functions to the dispatch table.
330  *
331  *	This is an undocumented public API entry point.
332  *
333  * We have two versions of this,  an external one and an internal one,
334  * because application-specific functions take different arguments
335  * for dispatch (ENV versus DB_ENV).
336  *
337  * PUBLIC: int __db_add_recovery __P((DB_ENV *, DB_DISTAB *,
338  * PUBLIC:   int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t));
339  */
340 int
__db_add_recovery(dbenv,dtab,func,ndx)341 __db_add_recovery(dbenv, dtab, func, ndx)
342 	DB_ENV *dbenv;
343 	DB_DISTAB *dtab;
344 	int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
345 	u_int32_t ndx;
346 {
347 	size_t i, newsize;
348 	int ret;
349 
350 	/* Make sure this is an application-specific record. */
351 	if (ndx < DB_user_BEGIN) {
352 		__db_errx(dbenv->env, DB_STR_A("0514",
353 	"Attempting to add application-specific record with invalid type %lu",
354 		    "%lu"), (u_long)ndx);
355 		return (EINVAL);
356 	}
357 	ndx -= DB_user_BEGIN;
358 
359 	/* Check if we have to grow the table. */
360 	if (ndx >= dtab->ext_size) {
361 		newsize = ndx + 40;
362 		if ((ret = __os_realloc(dbenv->env, newsize *
363 		    sizeof((dtab->ext_dispatch)[0]), &dtab->ext_dispatch)) != 0)
364 			return (ret);
365 		for (i = dtab->ext_size; i < newsize; ++i)
366 			(dtab->ext_dispatch)[i] = NULL;
367 		dtab->ext_size = newsize;
368 	}
369 
370 	(dtab->ext_dispatch)[ndx] = func;
371 	return (0);
372 }
373 
374 /*
375  * __db_add_recovery_int --
376  *
377  * This, the internal version of dispatch addition function, takes a function
378  * that has an ENV (rather than a DB_ENV) and a extra void * parameter that is
379  * passed through __db_dispatch() to the function.  It would be useful for the
380  * external version to also support a void *, but it wasn't thought of in time.
381  *
382  *	In current practice, the extra parameter always is a DB_TXNHEAD *.
383  *
384  * PUBLIC: int __db_add_recovery_int __P((ENV *, DB_DISTAB *,
385  * PUBLIC:   int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
386  */
387 int
__db_add_recovery_int(env,dtab,func,ndx)388 __db_add_recovery_int(env, dtab, func, ndx)
389 	ENV *env;
390 	DB_DISTAB *dtab;
391 	int (*func) __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
392 	u_int32_t ndx;
393 {
394 	size_t i, newsize;
395 	int ret;
396 
397 	if (ndx >= DB_user_BEGIN) {
398 		__db_errx(env, DB_STR_A("0515",
399 		    "Attempting to add internal record with invalid type %lu",
400 		    "%lu"), (u_long)ndx);
401 		return (EINVAL);
402 	}
403 
404 	/* Check if we have to grow the table. */
405 	if (ndx >= dtab->int_size) {
406 		newsize = ndx + 40;
407 		if ((ret = __os_realloc(env, newsize *
408 		    sizeof((dtab->int_dispatch)[0]), &dtab->int_dispatch)) != 0)
409 			return (ret);
410 		for (i = dtab->int_size; i < newsize; ++i)
411 			(dtab->int_dispatch)[i] = NULL;
412 		dtab->int_size = newsize;
413 	}
414 
415 	(dtab->int_dispatch)[ndx] = func;
416 	return (0);
417 }
418 
419 /*
420  * __db_txnlist_init --
421  *	Initialize transaction linked list.
422  *
423  * PUBLIC: int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *,
424  * PUBLIC:     u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
425  */
426 int
__db_txnlist_init(env,ip,low_txn,hi_txn,trunc_lsn,retp)427 __db_txnlist_init(env, ip, low_txn, hi_txn, trunc_lsn, retp)
428 	ENV *env;
429 	DB_THREAD_INFO *ip;
430 	u_int32_t low_txn, hi_txn;
431 	DB_LSN *trunc_lsn;
432 	DB_TXNHEAD **retp;
433 {
434 	DB_TXNHEAD *headp;
435 	u_int32_t size, tmp;
436 	int ret;
437 
438 	/*
439 	 * Size a hash table.
440 	 *	If low is zero then we are being called during rollback
441 	 * and we need only one slot.
442 	 *	Hi maybe lower than low if we have recycled txnid's.
443 	 *	The numbers here are guesses about txn density, we can afford
444 	 * to look at a few entries in each slot.
445 	 */
446 	if (low_txn == 0)
447 		size = 1;
448 	else {
449 		if (hi_txn < low_txn) {
450 			tmp = hi_txn;
451 			hi_txn = low_txn;
452 			low_txn = tmp;
453 		}
454 		tmp = hi_txn - low_txn;
455 		/* See if we wrapped around. */
456 		if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
457 			tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
458 		size = tmp / 5;
459 		if (size < 100)
460 			size = 100;
461 	}
462 	if ((ret = __os_malloc(env,
463 	    sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
464 		return (ret);
465 
466 	memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
467 	headp->maxid = hi_txn;
468 	headp->generation = 0;
469 	headp->nslots = size;
470 	headp->gen_alloc = 8;
471 	headp->thread_info = ip;
472 	if ((ret = __os_malloc(env, headp->gen_alloc *
473 	    sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
474 		__os_free(env, headp);
475 		return (ret);
476 	}
477 	headp->gen_array[0].generation = 0;
478 	headp->gen_array[0].txn_min = TXN_MINIMUM;
479 	headp->gen_array[0].txn_max = TXN_MAXIMUM;
480 	if (trunc_lsn != NULL) {
481 		headp->trunc_lsn = *trunc_lsn;
482 		headp->maxlsn = *trunc_lsn;
483 	} else {
484 		ZERO_LSN(headp->trunc_lsn);
485 		ZERO_LSN(headp->maxlsn);
486 	}
487 	ZERO_LSN(headp->ckplsn);
488 
489 	*retp = headp;
490 	return (0);
491 }
492 
493 #define	FIND_GENERATION(hp, txnid, gen) do {				\
494 	u_int32_t __i;							\
495 	for (__i = 0; __i <= (hp)->generation; __i++)			\
496 		/* The range may wrap around the end. */		\
497 		if ((hp)->gen_array[__i].txn_min <			\
498 		    (hp)->gen_array[__i].txn_max ?			\
499 		    ((txnid) >= (hp)->gen_array[__i].txn_min &&		\
500 		    (txnid) <= (hp)->gen_array[__i].txn_max) :		\
501 		    ((txnid) >= (hp)->gen_array[__i].txn_min ||		\
502 		    (txnid) <= (hp)->gen_array[__i].txn_max))		\
503 			break;						\
504 	DB_ASSERT(env, __i <= (hp)->generation);			\
505 	gen = (hp)->gen_array[__i].generation;				\
506 } while (0)
507 
508 /*
509  * __db_txnlist_add --
510  *	Add an element to our transaction linked list.
511  *
512  * PUBLIC: int __db_txnlist_add __P((ENV *,
513  * PUBLIC:     DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
514  */
515 int
__db_txnlist_add(env,hp,txnid,status,lsn)516 __db_txnlist_add(env, hp, txnid, status, lsn)
517 	ENV *env;
518 	DB_TXNHEAD *hp;
519 	u_int32_t txnid, status;
520 	DB_LSN *lsn;
521 {
522 	DB_TXNLIST *elp;
523 	int ret;
524 
525 	if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
526 		return (ret);
527 
528 	LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
529 
530 	/* Find the most recent generation containing this ID */
531 	FIND_GENERATION(hp, txnid, elp->u.t.generation);
532 	elp->type = TXNLIST_TXNID;
533 	elp->u.t.txnid = txnid;
534 	elp->u.t.status = status;
535 	if (txnid > hp->maxid)
536 		hp->maxid = txnid;
537 	if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
538 		hp->maxlsn = *lsn;
539 
540 	DB_ASSERT(env, lsn == NULL ||
541 	    status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0);
542 
543 	return (0);
544 }
545 
546 /*
547  * __db_txnlist_remove --
548  *	Remove an element from our transaction linked list.
549  *
550  * PUBLIC: int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t));
551  */
552 int
__db_txnlist_remove(env,hp,txnid)553 __db_txnlist_remove(env, hp, txnid)
554 	ENV *env;
555 	DB_TXNHEAD *hp;
556 	u_int32_t txnid;
557 {
558 	DB_TXNLIST *entry;
559 	u_int32_t status;
560 
561 	return (__db_txnlist_find_internal(env,
562 	    hp, TXNLIST_TXNID, txnid, &entry, 1, &status));
563 }
564 
565 /*
566  * __db_txnlist_ckp --
567  *	Used to record the maximum checkpoint that will be retained
568  * after recovery.  Typically this is simply the max checkpoint, but
569  * if we are doing client replication recovery or timestamp-based
570  * recovery, we are going to virtually truncate the log and we need
571  * to retain the last checkpoint before the truncation point.
572  *
573  * PUBLIC: void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *));
574  */
575 void
__db_txnlist_ckp(env,hp,ckp_lsn)576 __db_txnlist_ckp(env, hp, ckp_lsn)
577 	ENV *env;
578 	DB_TXNHEAD *hp;
579 	DB_LSN *ckp_lsn;
580 {
581 
582 	COMPQUIET(env, NULL);
583 
584 	if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
585 	    LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0)
586 		hp->ckplsn = *ckp_lsn;
587 }
588 
589 /*
590  * __db_txnlist_end --
591  *	Discard transaction linked list.
592  *
593  * PUBLIC: void __db_txnlist_end __P((ENV *, DB_TXNHEAD *));
594  */
595 void
__db_txnlist_end(env,hp)596 __db_txnlist_end(env, hp)
597 	ENV *env;
598 	DB_TXNHEAD *hp;
599 {
600 	u_int32_t i;
601 	DB_TXNLIST *p;
602 
603 	if (hp == NULL)
604 		return;
605 
606 	for (i = 0; i < hp->nslots; i++)
607 		while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
608 			switch (p->type) {
609 			case TXNLIST_LSN:
610 				__os_free(env, p->u.l.lsn_stack);
611 				break;
612 			case TXNLIST_DELETE:
613 			case TXNLIST_TXNID:
614 			default:
615 				/*
616 				 * Possibly an incomplete DB_TXNLIST; just
617 				 * free it.
618 				 */
619 				break;
620 			}
621 			LIST_REMOVE(p, links);
622 			__os_free(env, p);
623 		}
624 
625 	if (hp->gen_array != NULL)
626 		__os_free(env, hp->gen_array);
627 	__os_free(env, hp);
628 }
629 
630 /*
631  * __db_txnlist_find --
632  *	Checks to see if a txnid with the current generation is in the
633  *	txnid list.  This returns DB_NOTFOUND if the item isn't in the
634  *	list otherwise it returns (like __db_txnlist_find_internal)
635  *	the status of the transaction.  A txnid of 0 means the record
636  *	was generated while not in a transaction.
637  *
638  * PUBLIC: int __db_txnlist_find __P((ENV *,
639  * PUBLIC:     DB_TXNHEAD *, u_int32_t, u_int32_t *));
640  */
641 int
__db_txnlist_find(env,hp,txnid,statusp)642 __db_txnlist_find(env, hp, txnid, statusp)
643 	ENV *env;
644 	DB_TXNHEAD *hp;
645 	u_int32_t txnid, *statusp;
646 {
647 	DB_TXNLIST *entry;
648 
649 	if (txnid == 0)
650 		return (USR_ERR(env, DB_NOTFOUND));
651 
652 	return (__db_txnlist_find_internal(env, hp,
653 	    TXNLIST_TXNID, txnid, &entry, 0, statusp));
654 }
655 
656 /*
657  * __db_txnlist_update --
658  *	Change the status of an existing transaction entry.
659  *	Returns DB_NOTFOUND if no such entry exists.
660  *
661  * PUBLIC: int __db_txnlist_update __P((ENV *, DB_TXNHEAD *,
662  * PUBLIC:     u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
663  */
664 int
__db_txnlist_update(env,hp,txnid,status,lsn,ret_status,add_ok)665 __db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok)
666 	ENV *env;
667 	DB_TXNHEAD *hp;
668 	u_int32_t txnid, status;
669 	DB_LSN *lsn;
670 	u_int32_t *ret_status;
671 	int add_ok;
672 {
673 	DB_TXNLIST *elp;
674 	int ret;
675 
676 	if (txnid == 0)
677 		return (USR_ERR(env, DB_NOTFOUND));
678 
679 	ret = __db_txnlist_find_internal(env,
680 	    hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status);
681 
682 	if (ret == DB_NOTFOUND && add_ok) {
683 		*ret_status = status;
684 		return (__db_txnlist_add(env, hp, txnid, status, lsn));
685 	}
686 	if (ret != 0)
687 		return (ret);
688 
689 	if (*ret_status == TXN_IGNORE)
690 		return (0);
691 
692 	elp->u.t.status = status;
693 
694 	if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
695 		hp->maxlsn = *lsn;
696 
697 	return (ret);
698 }
699 
700 /*
701  * __db_txnlist_find_internal --
702  *	Find an entry on the transaction list.  If the entry is not there or
703  *	the list pointer is not initialized we return DB_NOTFOUND.  If the
704  *	item is found, we return the status.  Currently we always call this
705  *	with an initialized list pointer but checking for NULL keeps it general.
706  */
707 static int
__db_txnlist_find_internal(env,hp,type,txnid,txnlistp,del,statusp)708 __db_txnlist_find_internal(env,
709     hp, type, txnid, txnlistp, del, statusp)
710 	ENV *env;
711 	DB_TXNHEAD *hp;
712 	db_txnlist_type type;
713 	u_int32_t  txnid;
714 	DB_TXNLIST **txnlistp;
715 	int del;
716 	u_int32_t *statusp;
717 {
718 	struct __db_headlink *head;
719 	DB_TXNLIST *p;
720 	u_int32_t generation, hash;
721 	int ret;
722 
723 	ret = 0;
724 
725 	if (hp == NULL)
726 		return (USR_ERR(env, DB_NOTFOUND));
727 
728 	switch (type) {
729 	case TXNLIST_TXNID:
730 		hash = txnid;
731 		FIND_GENERATION(hp, txnid, generation);
732 		break;
733 	case TXNLIST_DELETE:
734 	case TXNLIST_LSN:
735 	default:
736 		return (__env_panic(env, EINVAL));
737 	}
738 
739 	head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
740 	LIST_FOREACH(p, head, links) {
741 		if (p->type != type)
742 			continue;
743 		switch (type) {
744 		case TXNLIST_TXNID:
745 			if (p->u.t.txnid != txnid ||
746 			    generation != p->u.t.generation)
747 				continue;
748 			*statusp = p->u.t.status;
749 			break;
750 
751 		case TXNLIST_DELETE:
752 		case TXNLIST_LSN:
753 		default:
754 			return (__env_panic(env, EINVAL));
755 		}
756 		if (del == 1) {
757 			LIST_REMOVE(p, links);
758 			__os_free(env, p);
759 			*txnlistp = NULL;
760 		} else if (p != LIST_FIRST(head)) {
761 			/* Move it to head of list. */
762 			LIST_REMOVE(p, links);
763 			LIST_INSERT_HEAD(head, p, links);
764 			*txnlistp = p;
765 		} else
766 			*txnlistp = p;
767 		return (ret);
768 	}
769 
770 	return (USR_ERR(env, DB_NOTFOUND));
771 }
772 
773 /*
774  * __db_txnlist_gen --
775  *	Change the current generation number.
776  *
777  * PUBLIC: int __db_txnlist_gen __P((ENV *,
778  * PUBLIC:       DB_TXNHEAD *, int, u_int32_t, u_int32_t));
779  */
780 int
__db_txnlist_gen(env,hp,incr,min,max)781 __db_txnlist_gen(env, hp, incr, min, max)
782 	ENV *env;
783 	DB_TXNHEAD *hp;
784 	int incr;
785 	u_int32_t min, max;
786 {
787 	int ret;
788 
789 	/*
790 	 * During recovery generation numbers keep track of "restart"
791 	 * checkpoints and recycle records.  Restart checkpoints occur
792 	 * whenever we take a checkpoint and there are no outstanding
793 	 * transactions.  When that happens, we can reset transaction IDs
794 	 * back to TXNID_MINIMUM.  Currently we only do the reset
795 	 * at then end of recovery.  Recycle records occur when txnids
796 	 * are exhausted during runtime.  A free range of ids is identified
797 	 * and logged.  This code maintains a stack of ranges.  A txnid
798 	 * is given the generation number of the first range it falls into
799 	 * in the stack.
800 	 */
801 	if (incr < 0) {
802 		--hp->generation;
803 		memmove(hp->gen_array, &hp->gen_array[1],
804 		    (hp->generation + 1) * sizeof(hp->gen_array[0]));
805 	} else {
806 		++hp->generation;
807 		if (hp->generation >= hp->gen_alloc) {
808 			hp->gen_alloc *= 2;
809 			if ((ret = __os_realloc(env, hp->gen_alloc *
810 			    sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
811 				return (ret);
812 		}
813 		memmove(&hp->gen_array[1], &hp->gen_array[0],
814 		    hp->generation * sizeof(hp->gen_array[0]));
815 		hp->gen_array[0].generation = hp->generation;
816 		hp->gen_array[0].txn_min = min;
817 		hp->gen_array[0].txn_max = max;
818 	}
819 	return (0);
820 }
821 
822 /*
823  * __db_txnlist_lsnadd --
824  *	Save the prev_lsn from a txn_child record.
825  *
826  * PUBLIC: int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *));
827  */
828 int
__db_txnlist_lsnadd(env,hp,lsnp)829 __db_txnlist_lsnadd(env, hp, lsnp)
830 	ENV *env;
831 	DB_TXNHEAD *hp;
832 	DB_LSN *lsnp;
833 {
834 	DB_TXNLIST *elp;
835 	int ret;
836 
837 	if (IS_ZERO_LSN(*lsnp))
838 		return (0);
839 
840 	LIST_FOREACH(elp, &hp->head[0], links)
841 		if (elp->type == TXNLIST_LSN)
842 			break;
843 
844 	if (elp == NULL) {
845 		if ((ret = __db_txnlist_lsninit(env, hp, lsnp)) != 0)
846 			return (ret);
847 		return (DB_SURPRISE_KID);
848 	}
849 
850 	if (elp->u.l.stack_indx == elp->u.l.stack_size) {
851 		elp->u.l.stack_size <<= 1;
852 		if ((ret = __os_realloc(env, sizeof(DB_LSN) *
853 		     elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) {
854 			__db_txnlist_end(env, hp);
855 			return (ret);
856 		}
857 	}
858 	elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp;
859 
860 	return (0);
861 }
862 
863 /*
864  * __db_txnlist_lsnget --
865  *
866  * PUBLIC: int __db_txnlist_lsnget __P((ENV *,
867  * PUBLIC:     DB_TXNHEAD *, DB_LSN *, u_int32_t));
868  *	Get the lsn saved from a txn_child record.
869  */
870 int
__db_txnlist_lsnget(env,hp,lsnp,flags)871 __db_txnlist_lsnget(env, hp, lsnp, flags)
872 	ENV *env;
873 	DB_TXNHEAD *hp;
874 	DB_LSN *lsnp;
875 	u_int32_t flags;
876 {
877 	DB_TXNLIST *elp;
878 
879 	COMPQUIET(env, NULL);
880 	COMPQUIET(flags, 0);
881 
882 	LIST_FOREACH(elp, &hp->head[0], links)
883 		if (elp->type == TXNLIST_LSN)
884 			break;
885 
886 	if (elp == NULL || elp->u.l.stack_indx == 0) {
887 		ZERO_LSN(*lsnp);
888 		return (0);
889 	}
890 
891 	*lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx];
892 
893 	return (0);
894 }
895 
896 /*
897  * __db_txnlist_lsninit --
898  *	Initialize a transaction list with an lsn array entry.
899  *
900  * PUBLIC: int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *));
901  */
902 int
__db_txnlist_lsninit(env,hp,lsnp)903 __db_txnlist_lsninit(env, hp, lsnp)
904 	ENV *env;
905 	DB_TXNHEAD *hp;
906 	DB_LSN *lsnp;
907 {
908 	DB_TXNLIST *elp;
909 	int ret;
910 
911 	elp = NULL;
912 
913 	if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
914 		goto err;
915 	LIST_INSERT_HEAD(&hp->head[0], elp, links);
916 
917 	elp->type = TXNLIST_LSN;
918 	if ((ret = __os_malloc(env,
919 	    sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0)
920 		goto err;
921 	elp->u.l.stack_indx = 1;
922 	elp->u.l.stack_size = DB_LSN_STACK_SIZE;
923 	elp->u.l.lsn_stack[0] = *lsnp;
924 
925 	return (0);
926 
927 err:	__db_txnlist_end(env, hp);
928 	return (ret);
929 }
930 
931 #ifdef DEBUG
932 /*
933  * __db_txnlist_print --
934  *	Print out the transaction list.
935  *
936  * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *));
937  */
938 void
__db_txnlist_print(hp)939 __db_txnlist_print(hp)
940 	DB_TXNHEAD *hp;
941 {
942 	DB_TXNLIST *p;
943 	u_int32_t i;
944 	char *txntype;
945 
946 	printf("Maxid: %lu Generation: %lu\n",
947 	    (u_long)hp->maxid, (u_long)hp->generation);
948 	for (i = 0; i < hp->nslots; i++)
949 		LIST_FOREACH(p, &hp->head[i], links) {
950 			if (p->type != TXNLIST_TXNID) {
951 				printf("Unrecognized type: %d\n", p->type);
952 				continue;
953 			}
954 			switch (p->u.t.status) {
955 			case TXN_OK:
956 				txntype = "OK";
957 				break;
958 			case TXN_COMMIT:
959 				txntype = "commit";
960 				break;
961 			case TXN_PREPARE:
962 				txntype = "prepare";
963 				break;
964 			case TXN_ABORT:
965 				txntype = "abort";
966 				break;
967 			case TXN_IGNORE:
968 				txntype = "ignore";
969 				break;
970 			case TXN_EXPECTED:
971 				txntype = "expected";
972 				break;
973 			case TXN_UNEXPECTED:
974 				txntype = "unexpected";
975 				break;
976 			default:
977 				txntype = "UNKNOWN";
978 				break;
979 			}
980 			printf("TXNID: %lx(%lu): %s\n",
981 			    (u_long)p->u.t.txnid,
982 			    (u_long)p->u.t.generation, txntype);
983 		}
984 }
985 #endif
986