1 /*-
2 * Copyright (c) 1996, 2020 Oracle and/or its affiliates. All rights reserved.
3 *
4 * See the file LICENSE for license information.
5 */
6 /*
7 * Copyright (c) 1995, 1996
8 * The President and Fellows of Harvard University. All rights reserved.
9 *
10 * This code is derived from software contributed to Berkeley by
11 * Margo Seltzer.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * $Id$
38 */
39
40 #include "db_config.h"
41
42 #include "db_int.h"
43 #include "dbinc/db_page.h"
44 #include "dbinc/hash.h"
45 #include "dbinc/fop.h"
46 #include "dbinc/lock.h"
47 #include "dbinc/mp.h"
48 #include "dbinc/txn.h"
49 #include "dbinc/log_verify.h"
50
51 static int __db_txnlist_find_internal __P((ENV *, DB_TXNHEAD *,
52 db_txnlist_type, u_int32_t, DB_TXNLIST **,
53 int, u_int32_t *));
54
55 /*
56 * __db_dispatch --
57 * Decide whether a given log record should be applied for a given
58 * "recovery" mode. If so, call the corresponding function in the
59 * DB_DISTAB dispatch table.
60 *
61 * Parameters:
62 * dtab - a dispatch table mapping the logrec type to the function to
63 * call. The normal one is built into env->recover_dtab.
64 * logrec - a transaction log record as a byte stream, in a DBT. This
65 * describes the data to dispatch.
66 * lsnp - logrec's LSN
67 * recop - what to do; usually one of recovery's steps in applying or
68 * undoing a txn; the others are DB_TXN_PRINT & DB_TXN_LOG_VERIFY.
69 * params - A pointer to pass to the function. It usually is a DB_TXNHEAD,
70 * but is DB_LOG when printing or DB_LOG_VERIFY for verify.
71 *
72 * PUBLIC: int __db_dispatch __P((ENV *,
73 * PUBLIC: DB_DISTAB *, DBT *, DB_LSN *, db_recops, void *));
74 */
75 int
__db_dispatch(env,dtab,logrec,lsnp,recop,params)76 __db_dispatch(env, dtab, logrec, lsnp, recop, params)
77 ENV *env;
78 DB_DISTAB *dtab;
79 DBT *logrec; /* The log record upon which to dispatch. */
80 DB_LSN *lsnp; /* The lsn of the record being dispatched. */
81 db_recops recop; /* Redo this op (or undo it). */
82 void *params;
83 {
84 DB_ENV *dbenv;
85 DB_TXNHEAD *txninfo;
86 DB_LOG_VRFY_INFO *lvh;
87 DB_LSN prev_lsn;
88 u_int32_t rectype, status, txnid, urectype;
89 int make_call, ret;
90
91 dbenv = env->dbenv;
92 make_call = ret = 0;
93 lvh = NULL;
94 txninfo = NULL;
95 LOGCOPY_32(env, &rectype, logrec->data);
96 LOGCOPY_32(env, &txnid, (u_int8_t *)logrec->data + sizeof(rectype));
97
98 /*
99 * Log verification passes a DB_LOG_VRFY_INFO structure, log print
100 * passes in a DB_LOG, everyone else passes a DB_TXNHEAD structure.
101 */
102 if (recop != DB_TXN_LOG_VERIFY)
103 txninfo = (DB_TXNHEAD *)params;
104 else
105 lvh = (DB_LOG_VRFY_INFO *)params;
106
107 /* If we don't have a dispatch table, it's hard to dispatch. */
108 DB_ASSERT(env, dtab != NULL);
109
110 /*
111 * If we find a record that is in the user's number space and they
112 * have specified a recovery routine, let them handle it. If they
113 * didn't specify a recovery routine, then we expect that they've
114 * followed all our rules and registered new recovery functions.
115 */
116 switch (recop) {
117 case DB_TXN_ABORT:
118 case DB_TXN_APPLY:
119 case DB_TXN_LOG_VERIFY:
120 case DB_TXN_PRINT:
121 make_call = 1;
122 break;
123 case DB_TXN_OPENFILES:
124 /*
125 * We collect all the transactions that have
126 * "begin" records, those with no previous LSN,
127 * so that we do not abort partial transactions.
128 * These are known to be undone, otherwise the
129 * log would not have been freeable.
130 */
131 LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)logrec->data +
132 sizeof(rectype) + sizeof(txnid));
133 if (txnid != 0 && prev_lsn.file == 0 && (ret =
134 __db_txnlist_add(env, txninfo, txnid, TXN_OK, NULL)) != 0)
135 return (ret);
136
137 /* FALLTHROUGH */
138 case DB_TXN_POPENFILES:
139 if (rectype == DB___dbreg_register ||
140 rectype == DB___txn_child ||
141 rectype == DB___txn_ckp || rectype == DB___txn_recycle)
142 return ((dtab->int_dispatch[rectype])(env,
143 logrec, lsnp, recop, params));
144 break;
145 case DB_TXN_BACKWARD_ROLL:
146 /*
147 * Running full recovery in the backward pass. In general,
148 * we only process records during this pass that belong
149 * to aborted transactions. Unfortunately, there are several
150 * exceptions:
151 * 1. If this is a meta-record, one not associated with
152 * a transaction, then we must always process it.
153 * 2. If this is a transaction commit/abort, we must
154 * always process it, so that we know the status of
155 * every transaction.
156 * 3. If this is a child commit, we need to process it
157 * because the outcome of the child transaction depends
158 * on the outcome of the parent.
159 * 4. If this is a dbreg_register record, we must always
160 * process is because they contain non-transactional
161 * closes that must be properly handled.
162 * 5. If this is a noop, we must always undo it so that we
163 * properly handle any aborts before a file was closed.
164 * 6. If this a file remove, we need to process it to
165 * determine if the on-disk file is the same as the
166 * one being described.
167 */
168 switch (rectype) {
169 /*
170 * These either do not belong to a transaction or (regop)
171 * must be processed regardless of the status of the
172 * transaction.
173 */
174 case DB___txn_regop:
175 case DB___txn_recycle:
176 case DB___txn_ckp:
177 make_call = 1;
178 break;
179 /*
180 * These belong to a transaction whose status must be
181 * checked.
182 */
183 case DB___txn_child:
184 case DB___db_noop:
185 case DB___fop_file_remove:
186 case DB___dbreg_register:
187 make_call = 1;
188
189 /* FALLTHROUGH */
190 default:
191 if (txnid == 0)
192 break;
193
194 ret = __db_txnlist_find(env, txninfo, txnid, &status);
195
196 /* If not found, this is an incomplete abort. */
197 if (ret == DB_NOTFOUND)
198 return (__db_txnlist_add(env,
199 txninfo, txnid, TXN_IGNORE, lsnp));
200 if (ret != 0)
201 return (ret);
202
203 /*
204 * If we ignore the transaction, ignore the operation
205 * UNLESS this is a child commit in which case we need
206 * to make sure that the child also gets marked as
207 * ignore.
208 */
209 if (status == TXN_IGNORE && rectype != DB___txn_child) {
210 make_call = 0;
211 break;
212 }
213 if (status == TXN_COMMIT)
214 break;
215
216 /* Set make_call in case we came through default */
217 make_call = 1;
218 if (status == TXN_OK &&
219 (ret = __db_txnlist_update(env,
220 txninfo, txnid, rectype == DB___txn_prepare ?
221 TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0)
222 return (ret);
223 }
224 break;
225 case DB_TXN_FORWARD_ROLL:
226 /*
227 * In the forward pass, if we haven't seen the transaction,
228 * do nothing, else recover it.
229 *
230 * We need to always redo DB___db_noop records, so that we
231 * properly handle any commits after the file was closed.
232 */
233 switch (rectype) {
234 case DB___txn_recycle:
235 case DB___txn_ckp:
236 case DB___db_noop:
237 case DB___dbreg_register:
238 make_call = 1;
239 break;
240
241 default:
242 if (txnid == 0)
243 status = 0;
244 else {
245 ret = __db_txnlist_find(env,
246 txninfo, txnid, &status);
247
248 if (ret == DB_NOTFOUND)
249 /* Break out of if clause. */
250 ;
251 else if (ret != 0)
252 return (ret);
253 else if (status == TXN_COMMIT) {
254 make_call = 1;
255 break;
256 }
257 }
258
259 }
260 break;
261 default:
262 return (__db_unknown_flag(
263 env, "__db_dispatch", (u_int32_t)recop));
264 }
265
266 if (make_call) {
267 /*
268 * If the debug flag is set then we are logging
269 * records for a non-durable update so that they
270 * may be examined for diagnostic purposes.
271 * So only make the call if we are printing,
272 * otherwise we need to extract the previous
273 * lsn so undo will work properly.
274 */
275 if (rectype & DB_debug_FLAG) {
276 if (recop == DB_TXN_PRINT)
277 rectype &= ~DB_debug_FLAG;
278 else {
279 LOGCOPY_TOLSN(env, lsnp,
280 (u_int8_t *)logrec->data +
281 sizeof(rectype) + sizeof(txnid));
282 return (0);
283 }
284 }
285 if (rectype >= DB_user_BEGIN) {
286 /*
287 * Increment user log count, we can't pass any extra
288 * args into app_dispatch, so this has to be done here.
289 */
290 if (recop == DB_TXN_LOG_VERIFY)
291 lvh->external_logrec_cnt++;
292 if (dbenv->app_dispatch != NULL)
293 return (dbenv->app_dispatch(dbenv,
294 logrec, lsnp, recop));
295
296 /* No application-specific dispatch */
297 urectype = rectype - DB_user_BEGIN;
298 if (urectype > dtab->ext_size ||
299 dtab->ext_dispatch[urectype] == NULL) {
300 __db_errx(env, DB_STR_A("0512",
301 "Illegal application-specific record type %lu in log",
302 "%lu"), (u_long)rectype);
303 return (EINVAL);
304 }
305
306 return ((dtab->ext_dispatch[urectype])(dbenv,
307 logrec, lsnp, recop));
308 } else {
309 if (rectype > dtab->int_size ||
310 dtab->int_dispatch[rectype] == NULL) {
311 __db_errx(env, DB_STR_A("0513",
312 "Illegal record type %lu in log", "%lu"),
313 (u_long)rectype);
314 if (recop == DB_TXN_LOG_VERIFY)
315 lvh->unknown_logrec_cnt++;
316
317 return (EINVAL);
318 }
319
320 return ((dtab->int_dispatch[rectype])(env,
321 logrec, lsnp, recop, params));
322 }
323 }
324
325 return (0);
326 }
327
328 /*
329 * __db_add_recovery -- Add recovery functions to the dispatch table.
330 *
331 * This is an undocumented public API entry point.
332 *
333 * We have two versions of this, an external one and an internal one,
334 * because application-specific functions take different arguments
335 * for dispatch (ENV versus DB_ENV).
336 *
337 * PUBLIC: int __db_add_recovery __P((DB_ENV *, DB_DISTAB *,
338 * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t));
339 */
340 int
__db_add_recovery(dbenv,dtab,func,ndx)341 __db_add_recovery(dbenv, dtab, func, ndx)
342 DB_ENV *dbenv;
343 DB_DISTAB *dtab;
344 int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
345 u_int32_t ndx;
346 {
347 size_t i, newsize;
348 int ret;
349
350 /* Make sure this is an application-specific record. */
351 if (ndx < DB_user_BEGIN) {
352 __db_errx(dbenv->env, DB_STR_A("0514",
353 "Attempting to add application-specific record with invalid type %lu",
354 "%lu"), (u_long)ndx);
355 return (EINVAL);
356 }
357 ndx -= DB_user_BEGIN;
358
359 /* Check if we have to grow the table. */
360 if (ndx >= dtab->ext_size) {
361 newsize = ndx + 40;
362 if ((ret = __os_realloc(dbenv->env, newsize *
363 sizeof((dtab->ext_dispatch)[0]), &dtab->ext_dispatch)) != 0)
364 return (ret);
365 for (i = dtab->ext_size; i < newsize; ++i)
366 (dtab->ext_dispatch)[i] = NULL;
367 dtab->ext_size = newsize;
368 }
369
370 (dtab->ext_dispatch)[ndx] = func;
371 return (0);
372 }
373
374 /*
375 * __db_add_recovery_int --
376 *
377 * This, the internal version of dispatch addition function, takes a function
378 * that has an ENV (rather than a DB_ENV) and a extra void * parameter that is
379 * passed through __db_dispatch() to the function. It would be useful for the
380 * external version to also support a void *, but it wasn't thought of in time.
381 *
382 * In current practice, the extra parameter always is a DB_TXNHEAD *.
383 *
384 * PUBLIC: int __db_add_recovery_int __P((ENV *, DB_DISTAB *,
385 * PUBLIC: int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
386 */
387 int
__db_add_recovery_int(env,dtab,func,ndx)388 __db_add_recovery_int(env, dtab, func, ndx)
389 ENV *env;
390 DB_DISTAB *dtab;
391 int (*func) __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
392 u_int32_t ndx;
393 {
394 size_t i, newsize;
395 int ret;
396
397 if (ndx >= DB_user_BEGIN) {
398 __db_errx(env, DB_STR_A("0515",
399 "Attempting to add internal record with invalid type %lu",
400 "%lu"), (u_long)ndx);
401 return (EINVAL);
402 }
403
404 /* Check if we have to grow the table. */
405 if (ndx >= dtab->int_size) {
406 newsize = ndx + 40;
407 if ((ret = __os_realloc(env, newsize *
408 sizeof((dtab->int_dispatch)[0]), &dtab->int_dispatch)) != 0)
409 return (ret);
410 for (i = dtab->int_size; i < newsize; ++i)
411 (dtab->int_dispatch)[i] = NULL;
412 dtab->int_size = newsize;
413 }
414
415 (dtab->int_dispatch)[ndx] = func;
416 return (0);
417 }
418
419 /*
420 * __db_txnlist_init --
421 * Initialize transaction linked list.
422 *
423 * PUBLIC: int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *,
424 * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
425 */
426 int
__db_txnlist_init(env,ip,low_txn,hi_txn,trunc_lsn,retp)427 __db_txnlist_init(env, ip, low_txn, hi_txn, trunc_lsn, retp)
428 ENV *env;
429 DB_THREAD_INFO *ip;
430 u_int32_t low_txn, hi_txn;
431 DB_LSN *trunc_lsn;
432 DB_TXNHEAD **retp;
433 {
434 DB_TXNHEAD *headp;
435 u_int32_t size, tmp;
436 int ret;
437
438 /*
439 * Size a hash table.
440 * If low is zero then we are being called during rollback
441 * and we need only one slot.
442 * Hi maybe lower than low if we have recycled txnid's.
443 * The numbers here are guesses about txn density, we can afford
444 * to look at a few entries in each slot.
445 */
446 if (low_txn == 0)
447 size = 1;
448 else {
449 if (hi_txn < low_txn) {
450 tmp = hi_txn;
451 hi_txn = low_txn;
452 low_txn = tmp;
453 }
454 tmp = hi_txn - low_txn;
455 /* See if we wrapped around. */
456 if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
457 tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
458 size = tmp / 5;
459 if (size < 100)
460 size = 100;
461 }
462 if ((ret = __os_malloc(env,
463 sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
464 return (ret);
465
466 memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
467 headp->maxid = hi_txn;
468 headp->generation = 0;
469 headp->nslots = size;
470 headp->gen_alloc = 8;
471 headp->thread_info = ip;
472 if ((ret = __os_malloc(env, headp->gen_alloc *
473 sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
474 __os_free(env, headp);
475 return (ret);
476 }
477 headp->gen_array[0].generation = 0;
478 headp->gen_array[0].txn_min = TXN_MINIMUM;
479 headp->gen_array[0].txn_max = TXN_MAXIMUM;
480 if (trunc_lsn != NULL) {
481 headp->trunc_lsn = *trunc_lsn;
482 headp->maxlsn = *trunc_lsn;
483 } else {
484 ZERO_LSN(headp->trunc_lsn);
485 ZERO_LSN(headp->maxlsn);
486 }
487 ZERO_LSN(headp->ckplsn);
488
489 *retp = headp;
490 return (0);
491 }
492
493 #define FIND_GENERATION(hp, txnid, gen) do { \
494 u_int32_t __i; \
495 for (__i = 0; __i <= (hp)->generation; __i++) \
496 /* The range may wrap around the end. */ \
497 if ((hp)->gen_array[__i].txn_min < \
498 (hp)->gen_array[__i].txn_max ? \
499 ((txnid) >= (hp)->gen_array[__i].txn_min && \
500 (txnid) <= (hp)->gen_array[__i].txn_max) : \
501 ((txnid) >= (hp)->gen_array[__i].txn_min || \
502 (txnid) <= (hp)->gen_array[__i].txn_max)) \
503 break; \
504 DB_ASSERT(env, __i <= (hp)->generation); \
505 gen = (hp)->gen_array[__i].generation; \
506 } while (0)
507
508 /*
509 * __db_txnlist_add --
510 * Add an element to our transaction linked list.
511 *
512 * PUBLIC: int __db_txnlist_add __P((ENV *,
513 * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
514 */
515 int
__db_txnlist_add(env,hp,txnid,status,lsn)516 __db_txnlist_add(env, hp, txnid, status, lsn)
517 ENV *env;
518 DB_TXNHEAD *hp;
519 u_int32_t txnid, status;
520 DB_LSN *lsn;
521 {
522 DB_TXNLIST *elp;
523 int ret;
524
525 if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
526 return (ret);
527
528 LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
529
530 /* Find the most recent generation containing this ID */
531 FIND_GENERATION(hp, txnid, elp->u.t.generation);
532 elp->type = TXNLIST_TXNID;
533 elp->u.t.txnid = txnid;
534 elp->u.t.status = status;
535 if (txnid > hp->maxid)
536 hp->maxid = txnid;
537 if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
538 hp->maxlsn = *lsn;
539
540 DB_ASSERT(env, lsn == NULL ||
541 status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0);
542
543 return (0);
544 }
545
546 /*
547 * __db_txnlist_remove --
548 * Remove an element from our transaction linked list.
549 *
550 * PUBLIC: int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t));
551 */
552 int
__db_txnlist_remove(env,hp,txnid)553 __db_txnlist_remove(env, hp, txnid)
554 ENV *env;
555 DB_TXNHEAD *hp;
556 u_int32_t txnid;
557 {
558 DB_TXNLIST *entry;
559 u_int32_t status;
560
561 return (__db_txnlist_find_internal(env,
562 hp, TXNLIST_TXNID, txnid, &entry, 1, &status));
563 }
564
565 /*
566 * __db_txnlist_ckp --
567 * Used to record the maximum checkpoint that will be retained
568 * after recovery. Typically this is simply the max checkpoint, but
569 * if we are doing client replication recovery or timestamp-based
570 * recovery, we are going to virtually truncate the log and we need
571 * to retain the last checkpoint before the truncation point.
572 *
573 * PUBLIC: void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *));
574 */
575 void
__db_txnlist_ckp(env,hp,ckp_lsn)576 __db_txnlist_ckp(env, hp, ckp_lsn)
577 ENV *env;
578 DB_TXNHEAD *hp;
579 DB_LSN *ckp_lsn;
580 {
581
582 COMPQUIET(env, NULL);
583
584 if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
585 LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0)
586 hp->ckplsn = *ckp_lsn;
587 }
588
589 /*
590 * __db_txnlist_end --
591 * Discard transaction linked list.
592 *
593 * PUBLIC: void __db_txnlist_end __P((ENV *, DB_TXNHEAD *));
594 */
595 void
__db_txnlist_end(env,hp)596 __db_txnlist_end(env, hp)
597 ENV *env;
598 DB_TXNHEAD *hp;
599 {
600 u_int32_t i;
601 DB_TXNLIST *p;
602
603 if (hp == NULL)
604 return;
605
606 for (i = 0; i < hp->nslots; i++)
607 while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
608 switch (p->type) {
609 case TXNLIST_LSN:
610 __os_free(env, p->u.l.lsn_stack);
611 break;
612 case TXNLIST_DELETE:
613 case TXNLIST_TXNID:
614 default:
615 /*
616 * Possibly an incomplete DB_TXNLIST; just
617 * free it.
618 */
619 break;
620 }
621 LIST_REMOVE(p, links);
622 __os_free(env, p);
623 }
624
625 if (hp->gen_array != NULL)
626 __os_free(env, hp->gen_array);
627 __os_free(env, hp);
628 }
629
630 /*
631 * __db_txnlist_find --
632 * Checks to see if a txnid with the current generation is in the
633 * txnid list. This returns DB_NOTFOUND if the item isn't in the
634 * list otherwise it returns (like __db_txnlist_find_internal)
635 * the status of the transaction. A txnid of 0 means the record
636 * was generated while not in a transaction.
637 *
638 * PUBLIC: int __db_txnlist_find __P((ENV *,
639 * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t *));
640 */
641 int
__db_txnlist_find(env,hp,txnid,statusp)642 __db_txnlist_find(env, hp, txnid, statusp)
643 ENV *env;
644 DB_TXNHEAD *hp;
645 u_int32_t txnid, *statusp;
646 {
647 DB_TXNLIST *entry;
648
649 if (txnid == 0)
650 return (USR_ERR(env, DB_NOTFOUND));
651
652 return (__db_txnlist_find_internal(env, hp,
653 TXNLIST_TXNID, txnid, &entry, 0, statusp));
654 }
655
656 /*
657 * __db_txnlist_update --
658 * Change the status of an existing transaction entry.
659 * Returns DB_NOTFOUND if no such entry exists.
660 *
661 * PUBLIC: int __db_txnlist_update __P((ENV *, DB_TXNHEAD *,
662 * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
663 */
664 int
__db_txnlist_update(env,hp,txnid,status,lsn,ret_status,add_ok)665 __db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok)
666 ENV *env;
667 DB_TXNHEAD *hp;
668 u_int32_t txnid, status;
669 DB_LSN *lsn;
670 u_int32_t *ret_status;
671 int add_ok;
672 {
673 DB_TXNLIST *elp;
674 int ret;
675
676 if (txnid == 0)
677 return (USR_ERR(env, DB_NOTFOUND));
678
679 ret = __db_txnlist_find_internal(env,
680 hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status);
681
682 if (ret == DB_NOTFOUND && add_ok) {
683 *ret_status = status;
684 return (__db_txnlist_add(env, hp, txnid, status, lsn));
685 }
686 if (ret != 0)
687 return (ret);
688
689 if (*ret_status == TXN_IGNORE)
690 return (0);
691
692 elp->u.t.status = status;
693
694 if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
695 hp->maxlsn = *lsn;
696
697 return (ret);
698 }
699
700 /*
701 * __db_txnlist_find_internal --
702 * Find an entry on the transaction list. If the entry is not there or
703 * the list pointer is not initialized we return DB_NOTFOUND. If the
704 * item is found, we return the status. Currently we always call this
705 * with an initialized list pointer but checking for NULL keeps it general.
706 */
707 static int
__db_txnlist_find_internal(env,hp,type,txnid,txnlistp,del,statusp)708 __db_txnlist_find_internal(env,
709 hp, type, txnid, txnlistp, del, statusp)
710 ENV *env;
711 DB_TXNHEAD *hp;
712 db_txnlist_type type;
713 u_int32_t txnid;
714 DB_TXNLIST **txnlistp;
715 int del;
716 u_int32_t *statusp;
717 {
718 struct __db_headlink *head;
719 DB_TXNLIST *p;
720 u_int32_t generation, hash;
721 int ret;
722
723 ret = 0;
724
725 if (hp == NULL)
726 return (USR_ERR(env, DB_NOTFOUND));
727
728 switch (type) {
729 case TXNLIST_TXNID:
730 hash = txnid;
731 FIND_GENERATION(hp, txnid, generation);
732 break;
733 case TXNLIST_DELETE:
734 case TXNLIST_LSN:
735 default:
736 return (__env_panic(env, EINVAL));
737 }
738
739 head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
740 LIST_FOREACH(p, head, links) {
741 if (p->type != type)
742 continue;
743 switch (type) {
744 case TXNLIST_TXNID:
745 if (p->u.t.txnid != txnid ||
746 generation != p->u.t.generation)
747 continue;
748 *statusp = p->u.t.status;
749 break;
750
751 case TXNLIST_DELETE:
752 case TXNLIST_LSN:
753 default:
754 return (__env_panic(env, EINVAL));
755 }
756 if (del == 1) {
757 LIST_REMOVE(p, links);
758 __os_free(env, p);
759 *txnlistp = NULL;
760 } else if (p != LIST_FIRST(head)) {
761 /* Move it to head of list. */
762 LIST_REMOVE(p, links);
763 LIST_INSERT_HEAD(head, p, links);
764 *txnlistp = p;
765 } else
766 *txnlistp = p;
767 return (ret);
768 }
769
770 return (USR_ERR(env, DB_NOTFOUND));
771 }
772
773 /*
774 * __db_txnlist_gen --
775 * Change the current generation number.
776 *
777 * PUBLIC: int __db_txnlist_gen __P((ENV *,
778 * PUBLIC: DB_TXNHEAD *, int, u_int32_t, u_int32_t));
779 */
780 int
__db_txnlist_gen(env,hp,incr,min,max)781 __db_txnlist_gen(env, hp, incr, min, max)
782 ENV *env;
783 DB_TXNHEAD *hp;
784 int incr;
785 u_int32_t min, max;
786 {
787 int ret;
788
789 /*
790 * During recovery generation numbers keep track of "restart"
791 * checkpoints and recycle records. Restart checkpoints occur
792 * whenever we take a checkpoint and there are no outstanding
793 * transactions. When that happens, we can reset transaction IDs
794 * back to TXNID_MINIMUM. Currently we only do the reset
795 * at then end of recovery. Recycle records occur when txnids
796 * are exhausted during runtime. A free range of ids is identified
797 * and logged. This code maintains a stack of ranges. A txnid
798 * is given the generation number of the first range it falls into
799 * in the stack.
800 */
801 if (incr < 0) {
802 --hp->generation;
803 memmove(hp->gen_array, &hp->gen_array[1],
804 (hp->generation + 1) * sizeof(hp->gen_array[0]));
805 } else {
806 ++hp->generation;
807 if (hp->generation >= hp->gen_alloc) {
808 hp->gen_alloc *= 2;
809 if ((ret = __os_realloc(env, hp->gen_alloc *
810 sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
811 return (ret);
812 }
813 memmove(&hp->gen_array[1], &hp->gen_array[0],
814 hp->generation * sizeof(hp->gen_array[0]));
815 hp->gen_array[0].generation = hp->generation;
816 hp->gen_array[0].txn_min = min;
817 hp->gen_array[0].txn_max = max;
818 }
819 return (0);
820 }
821
822 /*
823 * __db_txnlist_lsnadd --
824 * Save the prev_lsn from a txn_child record.
825 *
826 * PUBLIC: int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *));
827 */
828 int
__db_txnlist_lsnadd(env,hp,lsnp)829 __db_txnlist_lsnadd(env, hp, lsnp)
830 ENV *env;
831 DB_TXNHEAD *hp;
832 DB_LSN *lsnp;
833 {
834 DB_TXNLIST *elp;
835 int ret;
836
837 if (IS_ZERO_LSN(*lsnp))
838 return (0);
839
840 LIST_FOREACH(elp, &hp->head[0], links)
841 if (elp->type == TXNLIST_LSN)
842 break;
843
844 if (elp == NULL) {
845 if ((ret = __db_txnlist_lsninit(env, hp, lsnp)) != 0)
846 return (ret);
847 return (DB_SURPRISE_KID);
848 }
849
850 if (elp->u.l.stack_indx == elp->u.l.stack_size) {
851 elp->u.l.stack_size <<= 1;
852 if ((ret = __os_realloc(env, sizeof(DB_LSN) *
853 elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) {
854 __db_txnlist_end(env, hp);
855 return (ret);
856 }
857 }
858 elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp;
859
860 return (0);
861 }
862
863 /*
864 * __db_txnlist_lsnget --
865 *
866 * PUBLIC: int __db_txnlist_lsnget __P((ENV *,
867 * PUBLIC: DB_TXNHEAD *, DB_LSN *, u_int32_t));
868 * Get the lsn saved from a txn_child record.
869 */
870 int
__db_txnlist_lsnget(env,hp,lsnp,flags)871 __db_txnlist_lsnget(env, hp, lsnp, flags)
872 ENV *env;
873 DB_TXNHEAD *hp;
874 DB_LSN *lsnp;
875 u_int32_t flags;
876 {
877 DB_TXNLIST *elp;
878
879 COMPQUIET(env, NULL);
880 COMPQUIET(flags, 0);
881
882 LIST_FOREACH(elp, &hp->head[0], links)
883 if (elp->type == TXNLIST_LSN)
884 break;
885
886 if (elp == NULL || elp->u.l.stack_indx == 0) {
887 ZERO_LSN(*lsnp);
888 return (0);
889 }
890
891 *lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx];
892
893 return (0);
894 }
895
896 /*
897 * __db_txnlist_lsninit --
898 * Initialize a transaction list with an lsn array entry.
899 *
900 * PUBLIC: int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *));
901 */
902 int
__db_txnlist_lsninit(env,hp,lsnp)903 __db_txnlist_lsninit(env, hp, lsnp)
904 ENV *env;
905 DB_TXNHEAD *hp;
906 DB_LSN *lsnp;
907 {
908 DB_TXNLIST *elp;
909 int ret;
910
911 elp = NULL;
912
913 if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
914 goto err;
915 LIST_INSERT_HEAD(&hp->head[0], elp, links);
916
917 elp->type = TXNLIST_LSN;
918 if ((ret = __os_malloc(env,
919 sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0)
920 goto err;
921 elp->u.l.stack_indx = 1;
922 elp->u.l.stack_size = DB_LSN_STACK_SIZE;
923 elp->u.l.lsn_stack[0] = *lsnp;
924
925 return (0);
926
927 err: __db_txnlist_end(env, hp);
928 return (ret);
929 }
930
931 #ifdef DEBUG
932 /*
933 * __db_txnlist_print --
934 * Print out the transaction list.
935 *
936 * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *));
937 */
938 void
__db_txnlist_print(hp)939 __db_txnlist_print(hp)
940 DB_TXNHEAD *hp;
941 {
942 DB_TXNLIST *p;
943 u_int32_t i;
944 char *txntype;
945
946 printf("Maxid: %lu Generation: %lu\n",
947 (u_long)hp->maxid, (u_long)hp->generation);
948 for (i = 0; i < hp->nslots; i++)
949 LIST_FOREACH(p, &hp->head[i], links) {
950 if (p->type != TXNLIST_TXNID) {
951 printf("Unrecognized type: %d\n", p->type);
952 continue;
953 }
954 switch (p->u.t.status) {
955 case TXN_OK:
956 txntype = "OK";
957 break;
958 case TXN_COMMIT:
959 txntype = "commit";
960 break;
961 case TXN_PREPARE:
962 txntype = "prepare";
963 break;
964 case TXN_ABORT:
965 txntype = "abort";
966 break;
967 case TXN_IGNORE:
968 txntype = "ignore";
969 break;
970 case TXN_EXPECTED:
971 txntype = "expected";
972 break;
973 case TXN_UNEXPECTED:
974 txntype = "unexpected";
975 break;
976 default:
977 txntype = "UNKNOWN";
978 break;
979 }
980 printf("TXNID: %lx(%lu): %s\n",
981 (u_long)p->u.t.txnid,
982 (u_long)p->u.t.generation, txntype);
983 }
984 }
985 #endif
986