1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 2013 Oracle and/or its affiliates. All rights reserved.
5 *
6 * $Id$
7 */
8
9 #include "db_config.h"
10
11 #include "db_int.h"
12 #include "dbinc/crypto.h"
13 #include "dbinc/hmac.h"
14 #include "dbinc/log.h"
15 #include "dbinc/txn.h"
16 #include "dbinc/db_page.h"
17 #include "dbinc/db_am.h"
18
19 static int __log_init __P((ENV *, DB_LOG *));
20 static int __log_recover __P((DB_LOG *));
21
22 /*
23 * __log_open --
24 * Internal version of log_open: only called from ENV->open.
25 *
26 * PUBLIC: int __log_open __P((ENV *));
27 */
28 int
__log_open(env)29 __log_open(env)
30 ENV *env;
31 {
32 DB_ENV *dbenv;
33 DB_LOG *dblp;
34 LOG *lp;
35 u_int8_t *bulk;
36 int region_locked, ret;
37
38 dbenv = env->dbenv;
39 region_locked = 0;
40
41 /* Create/initialize the DB_LOG structure. */
42 if ((ret = __os_calloc(env, 1, sizeof(DB_LOG), &dblp)) != 0)
43 return (ret);
44 dblp->env = env;
45
46 /* Join/create the log region. */
47 if ((ret = __env_region_share(env, &dblp->reginfo)) != 0)
48 goto err;
49
50 /* If we created the region, initialize it. */
51 if (F_ISSET(&dblp->reginfo, REGION_CREATE))
52 if ((ret = __log_init(env, dblp)) != 0)
53 goto err;
54
55 /* Set the local addresses. */
56 lp = dblp->reginfo.primary = R_ADDR(&dblp->reginfo,
57 ((REGENV *)env->reginfo->primary)->lg_primary);
58 dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);
59
60 /*
61 * If the region is threaded, we have to lock the DBREG list, and we
62 * need to allocate a mutex for that purpose.
63 */
64 if ((ret = __mutex_alloc(env,
65 MTX_LOG_REGION, DB_MUTEX_PROCESS_ONLY, &dblp->mtx_dbreg)) != 0)
66 goto err;
67
68 /*
69 * Set the handle -- we may be about to run recovery, which allocates
70 * log cursors. Log cursors require logging be already configured,
71 * and the handle being set is what demonstrates that.
72 *
73 * If we created the region, run recovery. If that fails, make sure
74 * we reset the log handle before cleaning up, otherwise we will try
75 * and clean up again in the mainline ENV initialization code.
76 */
77 env->lg_handle = dblp;
78
79 if (F_ISSET(&dblp->reginfo, REGION_CREATE)) {
80 /*
81 * We first take the log file size from the environment, if
82 * specified. If that wasn't set, default it. Regardless,
83 * recovery may set it from the persistent information in a
84 * log file header.
85 */
86 if (lp->log_size == 0)
87 lp->log_size =
88 FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
89 LG_MAX_INMEM : LG_MAX_DEFAULT;
90
91 if ((ret = __log_recover(dblp)) != 0)
92 goto err;
93
94 /*
95 * If the next log file size hasn't been set yet, default it
96 * to the current log file size.
97 */
98 if (lp->log_nsize == 0)
99 lp->log_nsize = lp->log_size;
100
101 /*
102 * If we haven't written any log files, write the first one
103 * so that checkpoint gets a valid ckp_lsn value.
104 */
105 if (IS_INIT_LSN(lp->lsn) &&
106 (ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
107 goto err;
108
109 /*
110 * Initialize replication's next-expected LSN value
111 * and replication's bulk buffer. In __env_open, we
112 * always create/open the replication region before
113 * the log region so we're assured that our rep_handle
114 * is valid at this point, if replication is being used.
115 */
116 lp->ready_lsn = lp->lsn;
117 if (IS_ENV_REPLICATED(env)) {
118 if ((ret =
119 __env_alloc(&dblp->reginfo, MEGABYTE, &bulk)) != 0)
120 goto err;
121 lp->bulk_buf = R_OFFSET(&dblp->reginfo, bulk);
122 lp->bulk_len = MEGABYTE;
123 lp->bulk_off = 0;
124 lp->wait_ts = env->rep_handle->request_gap;
125 __os_gettime(env, &lp->rcvd_ts, 1);
126 } else {
127 lp->bulk_buf = INVALID_ROFF;
128 lp->bulk_len = 0;
129 lp->bulk_off = 0;
130 }
131 } else {
132 /*
133 * A process joining the region may have reset the log file
134 * size, too. If so, it only affects the next log file we
135 * create. We need to check that the size is reasonable given
136 * the buffer size in the region.
137 */
138 LOG_SYSTEM_LOCK(env);
139 region_locked = 1;
140
141 if (dbenv->lg_size != 0) {
142 if ((ret =
143 __log_check_sizes(env, dbenv->lg_size, 0)) != 0)
144 goto err;
145
146 lp->log_nsize = dbenv->lg_size;
147 }
148
149 LOG_SYSTEM_UNLOCK(env);
150 region_locked = 0;
151
152 if (dbenv->lg_flags != 0 && (ret =
153 __log_set_config_int(dbenv, dbenv->lg_flags, 1, 0)) != 0)
154 return (ret);
155 }
156 dblp->reginfo.mtx_alloc = lp->mtx_region;
157
158 return (0);
159
160 err: if (dblp->reginfo.addr != NULL) {
161 if (region_locked)
162 LOG_SYSTEM_UNLOCK(env);
163 (void)__env_region_detach(env, &dblp->reginfo, 0);
164 }
165 env->lg_handle = NULL;
166
167 (void)__mutex_free(env, &dblp->mtx_dbreg);
168 __os_free(env, dblp);
169
170 return (ret);
171 }
172
173 /*
174 * __log_init --
175 * Initialize a log region in shared memory.
176 */
177 static int
__log_init(env,dblp)178 __log_init(env, dblp)
179 ENV *env;
180 DB_LOG *dblp;
181 {
182 DB_ENV *dbenv;
183 LOG *lp;
184 int ret;
185 void *p;
186
187 dbenv = env->dbenv;
188
189 /*
190 * This is the first point where we can validate the buffer size,
191 * because we know all three settings have been configured (file size,
192 * buffer size and the in-memory flag).
193 */
194 if ((ret =
195 __log_check_sizes(env, dbenv->lg_size, dbenv->lg_bsize)) != 0)
196 return (ret);
197
198 if ((ret = __env_alloc(&dblp->reginfo,
199 sizeof(*lp), &dblp->reginfo.primary)) != 0)
200 goto mem_err;
201
202 ((REGENV *)env->reginfo->primary)->lg_primary =
203 R_OFFSET(&dblp->reginfo, dblp->reginfo.primary);
204
205 lp = dblp->reginfo.primary;
206 memset(lp, 0, sizeof(*lp));
207
208 /* We share the region so we need the same mutex. */
209 lp->mtx_region = ((REGENV *)env->reginfo->primary)->mtx_regenv;
210
211 lp->fid_max = 0;
212 SH_TAILQ_INIT(&lp->fq);
213 lp->free_fid_stack = INVALID_ROFF;
214 lp->free_fids = lp->free_fids_alloced = 0;
215
216 /* Initialize LOG LSNs. */
217 INIT_LSN(lp->lsn);
218 INIT_LSN(lp->t_lsn);
219
220 /*
221 * It's possible to be waiting for an LSN of [1][0], if a replication
222 * client gets the first log record out of order. An LSN of [0][0]
223 * signifies that we're not waiting.
224 */
225 ZERO_LSN(lp->waiting_lsn);
226
227 /*
228 * Log makes note of the fact that it ran into a checkpoint on
229 * startup if it did so, as a recovery optimization. A zero
230 * LSN signifies that it hasn't found one [yet].
231 */
232 ZERO_LSN(lp->cached_ckp_lsn);
233
234 if ((ret =
235 __mutex_alloc(env, MTX_LOG_FILENAME, 0, &lp->mtx_filelist)) != 0)
236 return (ret);
237 if ((ret = __mutex_alloc(env, MTX_LOG_FLUSH, 0, &lp->mtx_flush)) != 0)
238 return (ret);
239
240 /* Initialize the buffer. */
241 if ((ret = __env_alloc(&dblp->reginfo, dbenv->lg_bsize, &p)) != 0) {
242 mem_err: __db_errx( env, DB_STR("2524",
243 "unable to allocate log region memory"));
244 return (ret);
245 }
246 lp->regionmax = dbenv->lg_regionmax;
247 lp->buffer_off = R_OFFSET(&dblp->reginfo, p);
248 lp->buffer_size = dbenv->lg_bsize;
249 lp->filemode = dbenv->lg_filemode;
250 lp->log_size = lp->log_nsize = dbenv->lg_size;
251 lp->stat.st_fileid_init = dbenv->lg_fileid_init;
252
253 /* Initialize the commit Queue. */
254 SH_TAILQ_INIT(&lp->free_commits);
255 SH_TAILQ_INIT(&lp->commits);
256 lp->ncommit = 0;
257
258 /* Initialize the logfiles list for in-memory logs. */
259 SH_TAILQ_INIT(&lp->logfiles);
260 SH_TAILQ_INIT(&lp->free_logfiles);
261
262 /*
263 * Fill in the log's persistent header. Don't fill in the log file
264 * sizes, as they may change at any time and so have to be filled in
265 * as each log file is created.
266 */
267 lp->persist.magic = DB_LOGMAGIC;
268 /*
269 * Don't use __log_set_version because env->dblp isn't set up yet.
270 */
271 lp->persist.version = DB_LOGVERSION;
272 lp->persist.notused = 0;
273 env->lg_handle = dblp;
274
275 /* Migrate persistent flags from the ENV into the region. */
276 if (dbenv->lg_flags != 0 &&
277 (ret = __log_set_config_int(dbenv, dbenv->lg_flags, 1, 1)) != 0)
278 return (ret);
279
280 (void)time(&lp->timestamp);
281 return (0);
282 }
283
284 /*
285 * __log_recover --
286 * Recover a log.
287 */
288 static int
__log_recover(dblp)289 __log_recover(dblp)
290 DB_LOG *dblp;
291 {
292 DBT dbt;
293 DB_ENV *dbenv;
294 DB_LOGC *logc;
295 DB_LSN lsn;
296 ENV *env;
297 LOG *lp;
298 u_int32_t cnt, rectype;
299 int ret;
300 logfile_validity status;
301
302 env = dblp->env;
303 dbenv = env->dbenv;
304 logc = NULL;
305 lp = dblp->reginfo.primary;
306
307 /*
308 * Find a log file. If none exist, we simply return, leaving
309 * everything initialized to a new log.
310 */
311 if ((ret = __log_find(dblp, 0, &cnt, &status)) != 0)
312 return (ret);
313 if (cnt == 0) {
314 if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
315 __db_msg(env, DB_STR("2525", "No log files found"));
316 return (0);
317 }
318
319 /*
320 * If the last file is an old, unreadable version, start a new
321 * file. Don't bother finding the end of the last log file;
322 * we assume that it's valid in its entirety, since the user
323 * should have shut down cleanly or run recovery before upgrading.
324 */
325 if (status == DB_LV_OLD_UNREADABLE) {
326 lp->lsn.file = lp->s_lsn.file = cnt + 1;
327 lp->lsn.offset = lp->s_lsn.offset = 0;
328 goto skipsearch;
329 }
330 DB_ASSERT(env,
331 (status == DB_LV_NORMAL || status == DB_LV_OLD_READABLE));
332
333 /*
334 * We have the last useful log file and we've loaded any persistent
335 * information. Set the end point of the log past the end of the last
336 * file. Read the last file, looking for the last checkpoint and
337 * the log's end.
338 */
339 lp->lsn.file = cnt + 1;
340 lp->lsn.offset = 0;
341 lsn.file = cnt;
342 lsn.offset = 0;
343
344 /*
345 * Allocate a cursor and set it to the first record. This shouldn't
346 * fail, leave error messages on.
347 */
348 if ((ret = __log_cursor(env, &logc)) != 0)
349 return (ret);
350 F_SET(logc, DB_LOG_LOCKED);
351 memset(&dbt, 0, sizeof(dbt));
352 if ((ret = __logc_get(logc, &lsn, &dbt, DB_SET)) != 0)
353 goto err;
354
355 /*
356 * Read to the end of the file. This may fail at some point, so
357 * turn off error messages.
358 */
359 F_SET(logc, DB_LOG_SILENT_ERR);
360 while (__logc_get(logc, &lsn, &dbt, DB_NEXT) == 0) {
361 if (dbt.size < sizeof(u_int32_t))
362 continue;
363 LOGCOPY_32(env, &rectype, dbt.data);
364 if (rectype == DB___txn_ckp)
365 /*
366 * If we happen to run into a checkpoint, cache its
367 * LSN so that the transaction system doesn't have
368 * to walk this log file again looking for it.
369 */
370 lp->cached_ckp_lsn = lsn;
371 }
372 F_CLR(logc, DB_LOG_SILENT_ERR);
373
374 /*
375 * We now know where the end of the log is. Set the first LSN that
376 * we want to return to an application and the LSN of the last known
377 * record on disk.
378 */
379 lp->lsn = lsn;
380 lp->s_lsn = lsn;
381 lp->lsn.offset += logc->len;
382 lp->s_lsn.offset += logc->len;
383
384 /* Set up the current buffer information, too. */
385 lp->len = logc->len;
386 lp->a_off = 0;
387 lp->b_off = 0;
388 lp->w_off = lp->lsn.offset;
389
390 skipsearch:
391 if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
392 __db_msg(env, DB_STR_A("2526",
393 "Finding last valid log LSN: file: %lu offset %lu",
394 "%lu %lu"), (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
395
396 err: if (logc != NULL)
397 (void)__logc_close(logc);
398
399 return (ret);
400 }
401
402 /*
403 * __log_find --
404 * Try to find a log file. If find_first is set, valp will contain
405 * the number of the first readable log file, else it will contain the number
406 * of the last log file (which may be too old to read).
407 *
408 * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
409 */
410 int
__log_find(dblp,find_first,valp,statusp)411 __log_find(dblp, find_first, valp, statusp)
412 DB_LOG *dblp;
413 int find_first;
414 u_int32_t *valp;
415 logfile_validity *statusp;
416 {
417 ENV *env;
418 LOG *lp;
419 logfile_validity logval_status, status;
420 struct __db_filestart *filestart;
421 u_int32_t clv, logval;
422 int cnt, fcnt, ret;
423 const char *dir;
424 char *c, **names, *p, *q;
425
426 env = dblp->env;
427 lp = dblp->reginfo.primary;
428 logval_status = status = DB_LV_NONEXISTENT;
429
430 /* Return a value of 0 as the log file number on failure. */
431 *valp = 0;
432
433 if (lp->db_log_inmemory) {
434 filestart = find_first ?
435 SH_TAILQ_FIRST(&lp->logfiles, __db_filestart) :
436 SH_TAILQ_LAST(&lp->logfiles, links, __db_filestart);
437 if (filestart != NULL) {
438 *valp = filestart->file;
439 logval_status = DB_LV_NORMAL;
440 }
441 *statusp = logval_status;
442 return (0);
443 }
444
445 /* Find the directory name. */
446 if ((ret = __log_name(dblp, 1, &p, NULL, 0)) != 0) {
447 __os_free(env, p);
448 return (ret);
449 }
450 if ((q = __db_rpath(p)) == NULL)
451 dir = PATH_DOT;
452 else {
453 *q = '\0';
454 dir = p;
455 }
456
457 /* Get the list of file names. */
458 retry: if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
459 __db_err(env, ret, "%s", dir);
460 __os_free(env, p);
461 return (ret);
462 }
463
464 /* Search for a valid log file name. */
465 for (cnt = fcnt, clv = logval = 0; --cnt >= 0;) {
466 if (!IS_LOG_FILE(names[cnt]))
467 continue;
468
469 /*
470 * Names of the form log\.[0-9]* are reserved for DB. Other
471 * names sharing LFPREFIX, such as "log.db", are legal.
472 */
473 for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++)
474 if (!isdigit((int)*c))
475 break;
476 if (*c != '\0')
477 continue;
478
479 /*
480 * Use atol, not atoi; if an "int" is 16-bits, the largest
481 * log file name won't fit.
482 */
483 clv = (u_int32_t)atol(names[cnt] + (sizeof(LFPREFIX) - 1));
484
485 /*
486 * If searching for the first log file, we want to return the
487 * oldest log file we can read, or, if no readable log files
488 * exist, the newest log file we can't read (the crossover
489 * point between the old and new versions of the log file).
490 *
491 * If we're searching for the last log file, we want to return
492 * the newest log file, period.
493 *
494 * Readable log files should never precede unreadable log
495 * files, that would mean the admin seriously screwed up.
496 */
497 if (find_first) {
498 if (logval != 0 &&
499 status != DB_LV_OLD_UNREADABLE && clv > logval)
500 continue;
501 } else
502 if (logval != 0 && clv < logval)
503 continue;
504
505 if ((ret = __log_valid(dblp, clv, 1, NULL, 0,
506 &status, NULL)) != 0) {
507 /*
508 * If we have raced with removal of a log file since
509 * the call to __os_dirlist, it may no longer exist.
510 * In that case, just go on to the next one. If we're
511 * at the end of the list, all of the log files we saw
512 * initially are gone and we need to get the list again.
513 */
514 if (ret == ENOENT) {
515 ret = 0;
516 if (cnt == 0) {
517 __os_dirfree(env, names, fcnt);
518 goto retry;
519 }
520 continue;
521 }
522 __db_err(env, ret, DB_STR_A("2527",
523 "Invalid log file: %s", "%s"), names[cnt]);
524 goto err;
525 }
526 switch (status) {
527 case DB_LV_NONEXISTENT:
528 /* __log_valid never returns DB_LV_NONEXISTENT. */
529 DB_ASSERT(env, 0);
530 break;
531 case DB_LV_INCOMPLETE:
532 /*
533 * The last log file may not have been initialized --
534 * it's possible to create a log file but not write
535 * anything to it. If performing recovery (that is,
536 * if find_first isn't set), ignore the file, it's
537 * not interesting. If we're searching for the first
538 * log record, return the file (assuming we don't find
539 * something better), as the "real" first log record
540 * is likely to be in the log buffer, and we want to
541 * set the file LSN for our return.
542 */
543 if (find_first)
544 goto found;
545 break;
546 case DB_LV_OLD_UNREADABLE:
547 /*
548 * If we're searching for the first log file, then we
549 * only want this file if we don't yet have a file or
550 * already have an unreadable file and this one is
551 * newer than that one. If we're searching for the
552 * last log file, we always want this file because we
553 * wouldn't be here if it wasn't newer than our current
554 * choice.
555 */
556 if (!find_first || logval == 0 ||
557 (status == DB_LV_OLD_UNREADABLE && clv > logval))
558 goto found;
559 break;
560 case DB_LV_NORMAL:
561 case DB_LV_OLD_READABLE:
562 found: logval = clv;
563 logval_status = status;
564 break;
565 }
566 }
567
568 *valp = logval;
569
570 err: __os_dirfree(env, names, fcnt);
571 __os_free(env, p);
572 *statusp = logval_status;
573
574 return (ret);
575 }
576
577 /*
578 * log_valid --
579 * Validate a log file. Returns an error code in the event of
580 * a fatal flaw in a the specified log file; returns success with
581 * a code indicating the currentness and completeness of the specified
582 * log file if it is not unexpectedly flawed (that is, if it's perfectly
583 * normal, if it's zero-length, or if it's an old version).
584 *
585 * PUBLIC: int __log_valid __P((DB_LOG *, u_int32_t, int,
586 * PUBLIC: DB_FH **, u_int32_t, logfile_validity *, u_int32_t *));
587 */
588 int
__log_valid(dblp,number,set_persist,fhpp,flags,statusp,versionp)589 __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
590 DB_LOG *dblp;
591 u_int32_t number;
592 int set_persist;
593 DB_FH **fhpp;
594 u_int32_t flags;
595 logfile_validity *statusp;
596 u_int32_t *versionp;
597 {
598 DB_CIPHER *db_cipher;
599 DB_FH *fhp;
600 ENV *env;
601 HDR *hdr;
602 LOG *lp;
603 LOGP *persist;
604 logfile_validity status;
605 size_t hdrsize, nr, recsize;
606 int chksum_includes_hdr, is_hmac, ret;
607 u_int32_t logversion;
608 u_int8_t *tmp;
609 char *fname;
610
611 env = dblp->env;
612 db_cipher = env->crypto_handle;
613 fhp = NULL;
614 persist = NULL;
615 status = DB_LV_NORMAL;
616 tmp = NULL;
617 #if defined(HAVE_LOG_CHECKSUM)
618 /* Most log versions include the hdr in the checksum. */
619 chksum_includes_hdr = 1;
620 #else
621 COMPQUIET(chksum_includes_hdr, 0);
622 #endif
623
624 /* Return the file handle to our caller, on request */
625 if (fhpp != NULL)
626 *fhpp = NULL;
627
628 if (flags == 0)
629 flags = DB_OSO_RDONLY | DB_OSO_SEQ;
630 /* Try to open the log file. */
631 if ((ret = __log_name(dblp, number, &fname, &fhp, flags)) != 0) {
632 __os_free(env, fname);
633 return (ret);
634 }
635
636 hdrsize = HDR_NORMAL_SZ;
637 is_hmac = 0;
638 recsize = sizeof(LOGP);
639 if (CRYPTO_ON(env)) {
640 hdrsize = HDR_CRYPTO_SZ;
641 recsize += db_cipher->adj_size(recsize);
642 is_hmac = 1;
643 }
644 if ((ret = __os_calloc(env, 1, recsize + hdrsize, &tmp)) != 0)
645 goto err;
646
647 hdr = (HDR *)tmp;
648 persist = (LOGP *)(tmp + hdrsize);
649
650 /*
651 * Try to read the header. This can fail if the log is truncated, or
652 * if we find a preallocated log file where the header has not yet been
653 * written, so we need to check whether the header is zero-filled.
654 */
655 if ((ret = __os_read(env, fhp, tmp, recsize + hdrsize, &nr)) != 0 ||
656 nr != recsize + hdrsize ||
657 (hdr->len == 0 && persist->magic == 0 && persist->log_size == 0)) {
658 if (ret == 0)
659 status = DB_LV_INCOMPLETE;
660 else
661 /*
662 * The error was a fatal read error, not just an
663 * incompletely initialized log file.
664 */
665 __db_err(env, ret, DB_STR_A("2528",
666 "ignoring log file: %s", "%s"), fname);
667 goto err;
668 }
669
670 if (LOG_SWAPPED(env))
671 __log_hdrswap(hdr, CRYPTO_ON(env));
672
673 /*
674 * Now we have to validate the persistent record. We have
675 * several scenarios we have to deal with:
676 *
677 * 1. User has crypto turned on:
678 * - They're reading an old, unencrypted log file
679 * . We will fail the record size match check below.
680 * - They're reading a current, unencrypted log file
681 * . We will fail the record size match check below.
682 * - They're reading an old, encrypted log file [NOT YET]
683 * . After decryption we'll fail the version check. [NOT YET]
684 * - They're reading a current, encrypted log file
685 * . We should proceed as usual.
686 * 2. User has crypto turned off:
687 * - They're reading an old, unencrypted log file
688 * . We will fail the version check.
689 * - They're reading a current, unencrypted log file
690 * . We should proceed as usual.
691 * - They're reading an old, encrypted log file [NOT YET]
692 * . We'll fail the magic number check (it is encrypted).
693 * - They're reading a current, encrypted log file
694 * . We'll fail the magic number check (it is encrypted).
695 */
696 if (CRYPTO_ON(env)) {
697 /*
698 * If we are trying to decrypt an unencrypted log
699 * we can only detect that by having an unreasonable
700 * data length for our persistent data.
701 */
702 if ((hdr->len - hdrsize) != recsize) {
703 __db_errx(env, "log record size mismatch");
704 goto err;
705 }
706 /*
707 * The checksum is calculated from the encrypted data, and,
708 * for recent logs, the fields hdr->{prev,len}.
709 */
710 #ifdef HAVE_LOG_CHECKSUM
711 if ((ret = __db_check_chksum(env, hdr, db_cipher,
712 &hdr->chksum[0], (u_int8_t *)persist,
713 hdr->len - hdrsize, is_hmac)) != 0) {
714 /*
715 * The checksum doesn't verify when the header fields
716 * are included; try without the header.
717 */
718
719 if ((ret = __db_check_chksum(env, NULL, db_cipher,
720 &hdr->chksum[0], (u_int8_t *)persist,
721 hdr->len - hdrsize, is_hmac)) != 0)
722 goto bad_checksum;
723 /*
724 * The checksum verifies without the header. Make note
725 * of that, because it is only acceptable when the log
726 * version < DB_LOGCHKSUM. Later, when we determine log
727 * version, we will confirm this.
728 */
729 chksum_includes_hdr = 0;
730 }
731 #endif
732
733 if ((ret = db_cipher->decrypt(env, db_cipher->data,
734 &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0)
735 goto err;
736 }
737
738 /* Swap the header, if necessary. */
739 if (LOG_SWAPPED(env)) {
740 /*
741 * If the magic number is not byte-swapped, we're looking at an
742 * old log that we can no longer read.
743 */
744 if (persist->magic == DB_LOGMAGIC) {
745 __db_errx(env, DB_STR_A("2529",
746 "Ignoring log file: %s historic byte order",
747 "%s"), fname);
748 status = DB_LV_OLD_UNREADABLE;
749 goto err;
750 }
751
752 __log_persistswap(persist);
753 }
754
755 /* Validate the header. */
756 if (persist->magic != DB_LOGMAGIC) {
757 __db_errx(env, DB_STR_A("2530",
758 "Ignoring log file: %s: magic number %lx, not %lx",
759 "%s %lx %lx"), fname,
760 (u_long)persist->magic, (u_long)DB_LOGMAGIC);
761 ret = EINVAL;
762 goto err;
763 }
764
765 logversion = persist->version;
766 /*
767 * Set our status code to indicate whether the log file belongs to an
768 * unreadable or readable old version; leave it alone if and only if
769 * the log file version is the current one.
770 */
771 if (logversion > DB_LOGVERSION) {
772 /* This is a fatal error--the log file is newer than DB. */
773 __db_errx(env, DB_STR_A("2531",
774 "Unacceptable log file %s: unsupported log version %lu",
775 "%s %lu"), fname, (u_long)logversion);
776 ret = EINVAL;
777 goto err;
778 } else if (logversion < DB_LOGOLDVER) {
779 status = DB_LV_OLD_UNREADABLE;
780 /* This is a non-fatal error, but give some feedback. */
781 __db_errx(env, DB_STR_A("2532",
782 "Skipping log file %s: historic log version %lu", "%s %lu"),
783 fname, (u_long)logversion);
784 /*
785 * We don't want to set persistent info based on an unreadable
786 * region, so jump to "err".
787 */
788 goto err;
789 } else if (logversion < DB_LOGVERSION)
790 status = DB_LV_OLD_READABLE;
791
792 /*
793 * We could not check the checksum before checking the magic and version
794 * because old log headers put the length and checksum in a different
795 * location.
796 */
797 #ifdef HAVE_LOG_CHECKSUM
798 if (CRYPTO_ON(env)) {
799 /*
800 * We might have to declare a checksum failure here, if:
801 * - the checksum verified only by ignoring the header, and
802 * - the log version indicates that the header should have
803 * been included.
804 */
805 if (!chksum_includes_hdr && logversion >= DB_LOGCHKSUM)
806 goto bad_checksum;
807 } else {
808 /*
809 * The checksum was calculated with the swapped byte order. We
810 * might need to swap them back; the check needs the same bytes.
811 */
812 if (LOG_SWAPPED(env))
813 __log_persistswap(persist);
814 /*
815 * We have the logversion here, so we know whether to include
816 * the hdr or not.
817 */
818 if ((ret = __db_check_chksum(env,
819 logversion >= DB_LOGCHKSUM ? hdr : NULL, db_cipher,
820 &hdr->chksum[0], (u_int8_t *)persist,
821 hdr->len - hdrsize, is_hmac)) != 0) {
822 bad_checksum:
823 __db_errx(env, DB_STR("2533",
824 "log record checksum mismatch"));
825 goto err;
826 }
827
828 if (LOG_SWAPPED(env))
829 __log_persistswap(persist);
830 }
831 #endif
832
833 /*
834 * If the log is readable so far and we're doing system initialization,
835 * set the region's persistent information based on the headers.
836 *
837 * Override the current log file size.
838 */
839 if (set_persist) {
840 lp = dblp->reginfo.primary;
841 lp->log_size = persist->log_size;
842 lp->persist.version = logversion;
843 }
844 if (versionp != NULL)
845 *versionp = logversion;
846
847 err: if (fname != NULL)
848 __os_free(env, fname);
849 if (ret == 0 && fhpp != NULL)
850 *fhpp = fhp;
851 else
852 /* Must close on error or if we only used it locally. */
853 (void)__os_closehandle(env, fhp);
854 if (tmp != NULL)
855 __os_free(env, tmp);
856
857 if (statusp != NULL)
858 *statusp = status;
859
860 return (ret);
861 }
862
863 /*
864 * __log_env_refresh --
865 * Clean up after the log system on a close or failed open.
866 *
867 * PUBLIC: int __log_env_refresh __P((ENV *));
868 */
869 int
__log_env_refresh(env)870 __log_env_refresh(env)
871 ENV *env;
872 {
873 DB_LOG *dblp;
874 LOG *lp;
875 REGINFO *reginfo;
876 struct __fname *fnp;
877 struct __db_commit *commit;
878 struct __db_filestart *filestart;
879 int ret, t_ret;
880
881 dblp = env->lg_handle;
882 reginfo = &dblp->reginfo;
883 lp = reginfo->primary;
884 ret = 0;
885
886 /*
887 * Flush the log if it's private -- there's no Berkeley DB guarantee
888 * that this gets done, but in case the application has forgotten to
889 * flush for durability, it's the polite thing to do.
890 */
891 if (F_ISSET(env, ENV_PRIVATE) &&
892 (t_ret = __log_flush(env, NULL)) != 0 && ret == 0)
893 ret = t_ret;
894
895 if ((t_ret = __dbreg_close_files(env, 0)) != 0 && ret == 0)
896 ret = t_ret;
897
898 /*
899 * After we close the files, check for any unlogged closes left in
900 * the shared memory queue. If we find any, try to log it, otherwise
901 * return the error. We cannot say the environment was closed
902 * cleanly.
903 */
904 MUTEX_LOCK(env, lp->mtx_filelist);
905 SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
906 if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) &&
907 (t_ret = __dbreg_close_id_int(
908 env, fnp, DBREG_CLOSE, 1)) != 0)
909 ret = t_ret;
910 MUTEX_UNLOCK(env, lp->mtx_filelist);
911
912 /*
913 * If a private region, return the memory to the heap. Not needed for
914 * filesystem-backed or system shared memory regions, that memory isn't
915 * owned by any particular process.
916 */
917 if (F_ISSET(env, ENV_PRIVATE)) {
918 reginfo->mtx_alloc = MUTEX_INVALID;
919 /* Discard the flush mutex. */
920 if ((t_ret =
921 __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0)
922 ret = t_ret;
923
924 /* Discard the buffer. */
925 __env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off));
926
927 /* Discard stack of free file IDs. */
928 if (lp->free_fid_stack != INVALID_ROFF)
929 __env_alloc_free(reginfo,
930 R_ADDR(reginfo, lp->free_fid_stack));
931
932 /* Discard the list of in-memory log file markers. */
933 while ((filestart = SH_TAILQ_FIRST(&lp->logfiles,
934 __db_filestart)) != NULL) {
935 SH_TAILQ_REMOVE(&lp->logfiles, filestart, links,
936 __db_filestart);
937 __env_alloc_free(reginfo, filestart);
938 }
939
940 while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles,
941 __db_filestart)) != NULL) {
942 SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links,
943 __db_filestart);
944 __env_alloc_free(reginfo, filestart);
945 }
946
947 /* Discard commit queue elements. */
948 while ((commit = SH_TAILQ_FIRST(&lp->free_commits,
949 __db_commit)) != NULL) {
950 SH_TAILQ_REMOVE(&lp->free_commits, commit, links,
951 __db_commit);
952 __env_alloc_free(reginfo, commit);
953 }
954
955 /* Discard replication bulk buffer. */
956 if (lp->bulk_buf != INVALID_ROFF) {
957 __env_alloc_free(reginfo,
958 R_ADDR(reginfo, lp->bulk_buf));
959 lp->bulk_buf = INVALID_ROFF;
960 }
961 }
962
963 /* Discard the per-thread DBREG mutex. */
964 if ((t_ret = __mutex_free(env, &dblp->mtx_dbreg)) != 0 && ret == 0)
965 ret = t_ret;
966
967 /* Detach from the region. */
968 if ((t_ret = __env_region_detach(env, reginfo, 0)) != 0 && ret == 0)
969 ret = t_ret;
970
971 /* Close open files, release allocated memory. */
972 if (dblp->lfhp != NULL) {
973 if ((t_ret =
974 __os_closehandle(env, dblp->lfhp)) != 0 && ret == 0)
975 ret = t_ret;
976 dblp->lfhp = NULL;
977 }
978 if (dblp->dbentry != NULL)
979 __os_free(env, dblp->dbentry);
980
981 __os_free(env, dblp);
982
983 env->lg_handle = NULL;
984 return (ret);
985 }
986
987 /*
988 * __log_get_cached_ckp_lsn --
989 * Retrieve any last checkpoint LSN that we may have found on startup.
990 *
991 * PUBLIC: int __log_get_cached_ckp_lsn __P((ENV *, DB_LSN *));
992 */
993 int
__log_get_cached_ckp_lsn(env,ckp_lsnp)994 __log_get_cached_ckp_lsn(env, ckp_lsnp)
995 ENV *env;
996 DB_LSN *ckp_lsnp;
997 {
998 DB_LOG *dblp;
999 LOG *lp;
1000
1001 dblp = env->lg_handle;
1002 lp = (LOG *)dblp->reginfo.primary;
1003
1004 LOG_SYSTEM_LOCK(env);
1005 *ckp_lsnp = lp->cached_ckp_lsn;
1006 LOG_SYSTEM_UNLOCK(env);
1007
1008 return (0);
1009 }
1010
1011 /*
1012 * __log_region_mutex_count --
1013 * Return the number of mutexes the log region will need.
1014 *
1015 * PUBLIC: u_int32_t __log_region_mutex_count __P((ENV *));
1016 */
1017 u_int32_t
__log_region_mutex_count(env)1018 __log_region_mutex_count(env)
1019 ENV *env;
1020 {
1021 /*
1022 * We need a few assorted mutexes, and one per transaction waiting
1023 * on the group commit list. We can't know how many that will be,
1024 * but it should be bounded by the maximum active transactions.
1025 */
1026 return (env->dbenv->tx_init + 5);
1027 }
1028
1029 /*
1030 * __log_region_mutex_max --
1031 * Return the number of additional mutexes the log region will need.
1032 *
1033 * PUBLIC: u_int32_t __log_region_mutex_max __P((ENV *));
1034 */
1035 u_int32_t
__log_region_mutex_max(env)1036 __log_region_mutex_max(env)
1037 ENV *env;
1038 {
1039 DB_ENV *dbenv;
1040 u_int32_t count;
1041
1042 dbenv = env->dbenv;
1043
1044 if ((count = dbenv->tx_max) == 0)
1045 count = DEF_MAX_TXNS;
1046 if (count < dbenv->tx_init)
1047 return (0);
1048 return (count - dbenv->tx_init);
1049 }
1050
1051 /*
1052 * __log_region_size --
1053 * Return the amount of space needed for the log region.
1054 * Make the region large enough to hold txn_max transaction
1055 * detail structures plus some space to hold thread handles
1056 * and the beginning of the alloc region and anything we
1057 * need for mutex system resource recording.
1058 * PUBLIC: size_t __log_region_size __P((ENV *));
1059 */
1060 size_t
__log_region_size(env)1061 __log_region_size(env)
1062 ENV *env;
1063 {
1064 DB_ENV *dbenv;
1065 size_t s;
1066
1067 dbenv = env->dbenv;
1068
1069 /* Set the default buffer size, if not otherwise configured. */
1070 if (dbenv->lg_bsize == 0)
1071 dbenv->lg_bsize = FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
1072 LG_BSIZE_INMEM : LG_BSIZE_DEFAULT;
1073
1074 s = dbenv->lg_bsize;
1075 /* Allocate the initial fileid allocation, plus some path name space. */
1076 s += dbenv->lg_fileid_init * __env_alloc_size((sizeof(FNAME)) + 16);
1077
1078 return (s);
1079 }
1080 /*
1081 * __log_region_max --
1082 * Return the amount of extra memory to allocate for logging informaition.
1083 * PUBLIC: size_t __log_region_max __P((ENV *));
1084 */
1085 size_t
__log_region_max(env)1086 __log_region_max(env)
1087 ENV *env;
1088 {
1089
1090 DB_ENV *dbenv;
1091 size_t s;
1092
1093 dbenv = env->dbenv;
1094 if (dbenv->lg_fileid_init == 0) {
1095 if ((s = dbenv->lg_regionmax) == 0)
1096 s = LG_BASE_REGION_SIZE;
1097 } else if ((s = dbenv->lg_regionmax) != 0 &&
1098 s < dbenv->lg_fileid_init * (__env_alloc_size(sizeof(FNAME)) + 16))
1099 s = 0;
1100 else if (s != 0)
1101 s -= dbenv->lg_fileid_init *
1102 (__env_alloc_size(sizeof(FNAME)) + 16);
1103
1104 return (s);
1105 }
1106
1107 /*
1108 * __log_vtruncate
1109 * This is a virtual truncate. We set up the log indicators to
1110 * make everyone believe that the given record is the last one in the
1111 * log. Returns with the next valid LSN (i.e., the LSN of the next
1112 * record to be written). This is used in replication to discard records
1113 * in the log file that do not agree with the master.
1114 *
1115 * PUBLIC: int __log_vtruncate __P((ENV *, DB_LSN *, DB_LSN *, DB_LSN *));
1116 */
1117 int
__log_vtruncate(env,lsn,ckplsn,trunclsn)1118 __log_vtruncate(env, lsn, ckplsn, trunclsn)
1119 ENV *env;
1120 DB_LSN *lsn, *ckplsn, *trunclsn;
1121 {
1122 DBT log_dbt;
1123 DB_LOG *dblp;
1124 DB_LOGC *logc;
1125 LOG *lp;
1126 u_int32_t bytes, len;
1127 size_t offset;
1128 int ret, t_ret;
1129
1130 /* Need to find out the length of this soon-to-be-last record. */
1131 if ((ret = __log_cursor(env, &logc)) != 0)
1132 return (ret);
1133 memset(&log_dbt, 0, sizeof(log_dbt));
1134 ret = __logc_get(logc, lsn, &log_dbt, DB_SET);
1135 len = logc->len;
1136 if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
1137 ret = t_ret;
1138 if (ret != 0)
1139 return (ret);
1140
1141 /* Now do the truncate. */
1142 dblp = env->lg_handle;
1143 lp = (LOG *)dblp->reginfo.primary;
1144
1145 LOG_SYSTEM_LOCK(env);
1146
1147 /*
1148 * Flush the log so we can simply initialize the in-memory buffer
1149 * after the truncate.
1150 */
1151 if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
1152 goto err;
1153
1154 lp->lsn = *lsn;
1155 lp->len = len;
1156 lp->lsn.offset += lp->len;
1157
1158 offset = lp->b_off;
1159 if (lp->db_log_inmemory && (ret =
1160 __log_inmem_lsnoff(dblp, &lp->lsn, &offset)) != 0) {
1161 lp->b_off = (db_size_t)offset;
1162 goto err;
1163 }
1164 lp->b_off = (db_size_t)offset;
1165
1166 /*
1167 * I am going to assume that the number of bytes written since
1168 * the last checkpoint doesn't exceed a 32-bit number.
1169 */
1170 DB_ASSERT(env, lp->lsn.file >= ckplsn->file);
1171 bytes = 0;
1172 if (ckplsn->file != lp->lsn.file) {
1173 bytes = lp->log_size - ckplsn->offset;
1174 if (lp->lsn.file > ckplsn->file + 1)
1175 bytes += lp->log_size *
1176 ((lp->lsn.file - ckplsn->file) - 1);
1177 bytes += lp->lsn.offset;
1178 } else
1179 bytes = lp->lsn.offset - ckplsn->offset;
1180
1181 lp->stat.st_wc_mbytes += bytes / MEGABYTE;
1182 lp->stat.st_wc_bytes += bytes % MEGABYTE;
1183
1184 /*
1185 * If the synced lsn is greater than our new end of log, reset it
1186 * to our current end of log.
1187 */
1188 MUTEX_LOCK(env, lp->mtx_flush);
1189 if (LOG_COMPARE(&lp->s_lsn, lsn) > 0)
1190 lp->s_lsn = lp->lsn;
1191 MUTEX_UNLOCK(env, lp->mtx_flush);
1192
1193 /* Initialize the in-region buffer to a pristine state. */
1194 ZERO_LSN(lp->f_lsn);
1195 lp->w_off = lp->lsn.offset;
1196
1197 if (trunclsn != NULL)
1198 *trunclsn = lp->lsn;
1199
1200 /* Truncate the log to the new point. */
1201 if ((ret = __log_zero(env, &lp->lsn)) != 0)
1202 goto err;
1203
1204 err: LOG_SYSTEM_UNLOCK(env);
1205 return (ret);
1206 }
1207
1208 /*
1209 * __log_is_outdated --
1210 * Used by the replication system to identify if a client's logs are too
1211 * old.
1212 *
1213 * PUBLIC: int __log_is_outdated __P((ENV *, u_int32_t, int *));
1214 */
1215 int
__log_is_outdated(env,fnum,outdatedp)1216 __log_is_outdated(env, fnum, outdatedp)
1217 ENV *env;
1218 u_int32_t fnum;
1219 int *outdatedp;
1220 {
1221 DB_LOG *dblp;
1222 LOG *lp;
1223 char *name;
1224 int ret;
1225 u_int32_t cfile;
1226 struct __db_filestart *filestart;
1227
1228 dblp = env->lg_handle;
1229
1230 /*
1231 * The log represented by env is compared to the file number passed
1232 * in fnum. If the log file fnum does not exist and is lower-numbered
1233 * than the current logs, return *outdatedp non-zero, else we return 0.
1234 */
1235 if (FLD_ISSET(env->dbenv->lg_flags, DB_LOG_IN_MEMORY)) {
1236 LOG_SYSTEM_LOCK(env);
1237 lp = (LOG *)dblp->reginfo.primary;
1238 filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1239 *outdatedp = filestart == NULL ? 0 : (fnum < filestart->file);
1240 LOG_SYSTEM_UNLOCK(env);
1241 return (0);
1242 }
1243
1244 *outdatedp = 0;
1245 if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) {
1246 __os_free(env, name);
1247 return (ret);
1248 }
1249
1250 /* If the file exists, we're just fine. */
1251 if (__os_exists(env, name, NULL) == 0)
1252 goto out;
1253
1254 /*
1255 * It didn't exist, decide if the file number is too big or
1256 * too little. If it's too little, then we need to indicate
1257 * that the LSN is outdated.
1258 */
1259 LOG_SYSTEM_LOCK(env);
1260 lp = (LOG *)dblp->reginfo.primary;
1261 cfile = lp->lsn.file;
1262 LOG_SYSTEM_UNLOCK(env);
1263
1264 if (cfile > fnum)
1265 *outdatedp = 1;
1266 out: __os_free(env, name);
1267 return (ret);
1268 }
1269
1270 /*
1271 * __log_zero --
1272 * Zero out the tail of a log after a truncate.
1273 *
1274 * PUBLIC: int __log_zero __P((ENV *, DB_LSN *));
1275 */
1276 int
__log_zero(env,from_lsn)1277 __log_zero(env, from_lsn)
1278 ENV *env;
1279 DB_LSN *from_lsn;
1280 {
1281 DB_FH *fhp;
1282 DB_LOG *dblp;
1283 LOG *lp;
1284 struct __db_filestart *filestart, *nextstart;
1285 size_t nbytes, len, nw;
1286 u_int32_t fn, mbytes, bytes;
1287 u_int8_t buf[4096];
1288 int ret;
1289 char *fname;
1290
1291 dblp = env->lg_handle;
1292 lp = (LOG *)dblp->reginfo.primary;
1293 DB_ASSERT(env, LOG_COMPARE(from_lsn, &lp->lsn) <= 0);
1294 if (LOG_COMPARE(from_lsn, &lp->lsn) > 0) {
1295 __db_errx(env, DB_STR("2534",
1296 "Warning: truncating to point beyond end of log"));
1297 return (0);
1298 }
1299
1300 if (lp->db_log_inmemory) {
1301 /*
1302 * Remove the files that are invalidated by this truncate.
1303 */
1304 for (filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1305 filestart != NULL; filestart = nextstart) {
1306 nextstart = SH_TAILQ_NEXT(filestart,
1307 links, __db_filestart);
1308 if (filestart->file > from_lsn->file) {
1309 SH_TAILQ_REMOVE(&lp->logfiles,
1310 filestart, links, __db_filestart);
1311 SH_TAILQ_INSERT_HEAD(&lp->free_logfiles,
1312 filestart, links, __db_filestart);
1313 }
1314 }
1315
1316 return (0);
1317 }
1318
1319 /* Close any open file handles so unlinks don't fail. */
1320 if (dblp->lfhp != NULL) {
1321 (void)__os_closehandle(env, dblp->lfhp);
1322 dblp->lfhp = NULL;
1323 }
1324
1325 /* Throw away any extra log files that we have around. */
1326 for (fn = from_lsn->file + 1;; fn++) {
1327 if (__log_name(dblp, fn, &fname, &fhp, DB_OSO_RDONLY) != 0) {
1328 __os_free(env, fname);
1329 break;
1330 }
1331 (void)__os_closehandle(env, fhp);
1332 (void)time(&lp->timestamp);
1333 ret = __os_unlink(env, fname, 0);
1334 __os_free(env, fname);
1335 if (ret != 0)
1336 return (ret);
1337 }
1338
1339 /* We removed some log files; have to 0 to end of file. */
1340 if ((ret =
1341 __log_name(dblp, from_lsn->file, &fname, &dblp->lfhp, 0)) != 0) {
1342 __os_free(env, fname);
1343 return (ret);
1344 }
1345 __os_free(env, fname);
1346 if ((ret = __os_ioinfo(env,
1347 NULL, dblp->lfhp, &mbytes, &bytes, NULL)) != 0)
1348 goto err;
1349 DB_ASSERT(env, (mbytes * MEGABYTE + bytes) >= from_lsn->offset);
1350 len = (mbytes * MEGABYTE + bytes) - from_lsn->offset;
1351
1352 memset(buf, 0, sizeof(buf));
1353
1354 /* Initialize the write position. */
1355 if ((ret = __os_seek(env, dblp->lfhp, 0, 0, from_lsn->offset)) != 0)
1356 goto err;
1357
1358 while (len > 0) {
1359 nbytes = len > sizeof(buf) ? sizeof(buf) : len;
1360 if ((ret =
1361 __os_write(env, dblp->lfhp, buf, nbytes, &nw)) != 0)
1362 goto err;
1363 len -= nbytes;
1364 }
1365
1366 err: (void)__os_closehandle(env, dblp->lfhp);
1367 dblp->lfhp = NULL;
1368
1369 return (ret);
1370 }
1371
1372 /*
1373 * __log_inmem_lsnoff --
1374 * Find the offset in the buffer of a given LSN.
1375 *
1376 * PUBLIC: int __log_inmem_lsnoff __P((DB_LOG *, DB_LSN *, size_t *));
1377 */
1378 int
__log_inmem_lsnoff(dblp,lsnp,offsetp)1379 __log_inmem_lsnoff(dblp, lsnp, offsetp)
1380 DB_LOG *dblp;
1381 DB_LSN *lsnp;
1382 size_t *offsetp;
1383 {
1384 LOG *lp;
1385 struct __db_filestart *filestart;
1386
1387 lp = (LOG *)dblp->reginfo.primary;
1388
1389 SH_TAILQ_FOREACH(filestart, &lp->logfiles, links, __db_filestart)
1390 if (filestart->file == lsnp->file) {
1391 *offsetp = (u_int32_t)
1392 (filestart->b_off + lsnp->offset) % lp->buffer_size;
1393 return (0);
1394 }
1395
1396 return (DB_NOTFOUND);
1397 }
1398
1399 /*
1400 * __log_inmem_newfile --
1401 * Records the offset of the beginning of a new file in the in-memory
1402 * buffer.
1403 *
1404 * PUBLIC: int __log_inmem_newfile __P((DB_LOG *, u_int32_t));
1405 */
1406 int
__log_inmem_newfile(dblp,file)1407 __log_inmem_newfile(dblp, file)
1408 DB_LOG *dblp;
1409 u_int32_t file;
1410 {
1411 HDR hdr;
1412 LOG *lp;
1413 struct __db_filestart *filestart;
1414 int ret;
1415 #ifdef DIAGNOSTIC
1416 struct __db_filestart *first, *last;
1417 #endif
1418
1419 lp = (LOG *)dblp->reginfo.primary;
1420
1421 /*
1422 * If the log buffer is empty, reuse the filestart entry.
1423 */
1424 filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1425 if (filestart != NULL &&
1426 RINGBUF_LEN(lp, filestart->b_off, lp->b_off) <=
1427 sizeof(HDR) + sizeof(LOGP)) {
1428 filestart->file = file;
1429 filestart->b_off = lp->b_off;
1430 return (0);
1431 }
1432
1433 /*
1434 * We write an empty header at the end of every in-memory log file.
1435 * This is used during cursor traversal to indicate when to switch the
1436 * LSN to the next file.
1437 */
1438 if (file > 1) {
1439 memset(&hdr, 0, sizeof(HDR));
1440 __log_inmem_copyin(dblp, lp->b_off, &hdr, sizeof(HDR));
1441 lp->b_off = (lp->b_off + sizeof(HDR)) % lp->buffer_size;
1442 }
1443
1444 filestart = SH_TAILQ_FIRST(&lp->free_logfiles, __db_filestart);
1445 if (filestart == NULL) {
1446 if ((ret = __env_alloc(&dblp->reginfo,
1447 sizeof(struct __db_filestart), &filestart)) != 0)
1448 return (ret);
1449 memset(filestart, 0, sizeof(*filestart));
1450 } else
1451 SH_TAILQ_REMOVE(&lp->free_logfiles, filestart,
1452 links, __db_filestart);
1453
1454 filestart->file = file;
1455 filestart->b_off = lp->b_off;
1456
1457 #ifdef DIAGNOSTIC
1458 first = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1459 last = SH_TAILQ_LAST(&(lp)->logfiles, links, __db_filestart);
1460
1461 /* Check that we don't wrap. */
1462 DB_ASSERT(dblp->env, !first || first == last ||
1463 RINGBUF_LEN(lp, first->b_off, lp->b_off) ==
1464 RINGBUF_LEN(lp, first->b_off, last->b_off) +
1465 RINGBUF_LEN(lp, last->b_off, lp->b_off));
1466 #endif
1467
1468 SH_TAILQ_INSERT_TAIL(&lp->logfiles, filestart, links);
1469 return (0);
1470 }
1471
1472 /*
1473 * __log_inmem_chkspace --
1474 * Ensure that the requested amount of space is available in the buffer,
1475 * and invalidate the region.
1476 * Note: assumes that the region lock is held on entry.
1477 *
1478 * PUBLIC: int __log_inmem_chkspace __P((DB_LOG *, size_t));
1479 */
1480 int
__log_inmem_chkspace(dblp,len)1481 __log_inmem_chkspace(dblp, len)
1482 DB_LOG *dblp;
1483 size_t len;
1484 {
1485 DB_LSN active_lsn, old_active_lsn;
1486 ENV *env;
1487 LOG *lp;
1488 struct __db_filestart *filestart;
1489 size_t offset;
1490 int ret;
1491
1492 env = dblp->env;
1493 lp = dblp->reginfo.primary;
1494
1495 DB_ASSERT(env, lp->db_log_inmemory);
1496
1497 /*
1498 * Allow room for an extra header so that we don't need to check for
1499 * space when switching files.
1500 */
1501 len += sizeof(HDR);
1502
1503 /*
1504 * If transactions are enabled and we're about to fill available space,
1505 * update the active LSN and recheck. If transactions aren't enabled,
1506 * don't even bother checking: in that case we can always overwrite old
1507 * log records, because we're never going to abort.
1508 */
1509 while (TXN_ON(env) &&
1510 RINGBUF_LEN(lp, lp->b_off, lp->a_off) <= len) {
1511 old_active_lsn = lp->active_lsn;
1512 active_lsn = lp->lsn;
1513
1514 /*
1515 * Drop the log region lock so we don't hold it while
1516 * taking the transaction region lock.
1517 */
1518 LOG_SYSTEM_UNLOCK(env);
1519 ret = __txn_getactive(env, &active_lsn);
1520 LOG_SYSTEM_LOCK(env);
1521 if (ret != 0)
1522 return (ret);
1523 active_lsn.offset = 0;
1524
1525 /* If we didn't make any progress, give up. */
1526 if (LOG_COMPARE(&active_lsn, &old_active_lsn) == 0) {
1527 __db_errx(env, DB_STR("2535",
1528 "In-memory log buffer is full (an active transaction spans the buffer)"));
1529 return (DB_LOG_BUFFER_FULL);
1530 }
1531
1532 /* Make sure we're moving the region LSN forwards. */
1533 if (LOG_COMPARE(&active_lsn, &lp->active_lsn) > 0) {
1534 lp->active_lsn = active_lsn;
1535 offset = lp->a_off;
1536 (void)__log_inmem_lsnoff(dblp, &active_lsn, &offset);
1537 lp->a_off = (db_size_t)offset;
1538 }
1539 }
1540
1541 /*
1542 * Remove the first file if it is invalidated by this write.
1543 * Log records can't be bigger than a file, so we only need to
1544 * check the first file.
1545 */
1546 filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1547 if (filestart != NULL &&
1548 RINGBUF_LEN(lp, lp->b_off, filestart->b_off) <= len) {
1549 SH_TAILQ_REMOVE(&lp->logfiles, filestart,
1550 links, __db_filestart);
1551 SH_TAILQ_INSERT_HEAD(&lp->free_logfiles, filestart,
1552 links, __db_filestart);
1553 lp->f_lsn.file = filestart->file + 1;
1554 }
1555
1556 return (0);
1557 }
1558
1559 /*
1560 * __log_inmem_copyout --
1561 * Copies the given number of bytes from the buffer -- no checking.
1562 * Note: assumes that the region lock is held on entry.
1563 *
1564 * PUBLIC: void __log_inmem_copyout __P((DB_LOG *, size_t, void *, size_t));
1565 */
1566 void
__log_inmem_copyout(dblp,offset,buf,size)1567 __log_inmem_copyout(dblp, offset, buf, size)
1568 DB_LOG *dblp;
1569 size_t offset;
1570 void *buf;
1571 size_t size;
1572 {
1573 LOG *lp;
1574 size_t nbytes;
1575
1576 lp = (LOG *)dblp->reginfo.primary;
1577 nbytes = (offset + size < lp->buffer_size) ?
1578 size : lp->buffer_size - offset;
1579 memcpy(buf, dblp->bufp + offset, nbytes);
1580 if (nbytes < size)
1581 memcpy((u_int8_t *)buf + nbytes, dblp->bufp, size - nbytes);
1582 }
1583
1584 /*
1585 * __log_inmem_copyin --
1586 * Copies the given number of bytes into the buffer -- no checking.
1587 * Note: assumes that the region lock is held on entry.
1588 *
1589 * PUBLIC: void __log_inmem_copyin __P((DB_LOG *, size_t, void *, size_t));
1590 */
1591 void
__log_inmem_copyin(dblp,offset,buf,size)1592 __log_inmem_copyin(dblp, offset, buf, size)
1593 DB_LOG *dblp;
1594 size_t offset;
1595 void *buf;
1596 size_t size;
1597 {
1598 LOG *lp;
1599 size_t nbytes;
1600
1601 lp = (LOG *)dblp->reginfo.primary;
1602 nbytes = (offset + size < lp->buffer_size) ?
1603 size : lp->buffer_size - offset;
1604 memcpy(dblp->bufp + offset, buf, nbytes);
1605 if (nbytes < size)
1606 memcpy(dblp->bufp, (u_int8_t *)buf + nbytes, size - nbytes);
1607 }
1608
1609 /*
1610 * __log_set_version --
1611 * Sets the current version of the log subsystem to the given version.
1612 * Essentially this modifies the lp->persist.version field in the
1613 * shared memory region. Called when region is initially created
1614 * and when replication is starting up or finds a new master.
1615 *
1616 * PUBLIC: void __log_set_version __P((ENV *, u_int32_t));
1617 */
1618 void
__log_set_version(env,newver)1619 __log_set_version(env, newver)
1620 ENV *env;
1621 u_int32_t newver;
1622 {
1623 DB_LOG *dblp;
1624 LOG *lp;
1625
1626 dblp = env->lg_handle;
1627 lp = (LOG *)dblp->reginfo.primary;
1628 /*
1629 * We should be able to update this atomically without locking.
1630 */
1631 lp->persist.version = newver;
1632 }
1633
1634 /*
1635 * __log_get_oldversion --
1636 * Returns the last version of log that this environment was working
1637 * with. Since there could be several versions of log files, if
1638 * the user upgraded and didn't log archive, we check the version
1639 * of the first log file, compare it to the last log file. If those
1640 * are different, then there is an older log existing, and we then
1641 * walk backward in the log files looking for the version of the
1642 * most recent older log file.
1643 *
1644 * PUBLIC: int __log_get_oldversion __P((ENV *, u_int32_t *));
1645 */
1646 int
__log_get_oldversion(env,ver)1647 __log_get_oldversion(env, ver)
1648 ENV *env;
1649 u_int32_t *ver;
1650 {
1651 DBT rec;
1652 DB_LOG *dblp;
1653 DB_LOGC *logc;
1654 DB_LSN lsn;
1655 LOG *lp;
1656 u_int32_t firstfnum, fnum, lastver, oldver;
1657 int ret, t_ret;
1658
1659 dblp = env->lg_handle;
1660 lp = dblp->reginfo.primary;
1661
1662 logc = NULL;
1663 ret = 0;
1664 oldver = DB_LOGVERSION;
1665 /*
1666 * If we're in-memory logs we're always the current version.
1667 */
1668 if (lp->db_log_inmemory) {
1669 *ver = oldver;
1670 return (0);
1671 }
1672 memset(&rec, 0, sizeof(rec));
1673 if ((ret = __log_cursor(env, &logc)) != 0)
1674 goto err;
1675 /*
1676 * Get the version numbers of the first and last log files.
1677 */
1678 if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
1679 /*
1680 * If there is no log file, we'll get DB_NOTFOUND.
1681 * If we get that, set the version to the current.
1682 */
1683 if (ret == DB_NOTFOUND)
1684 ret = 0;
1685 goto err;
1686 }
1687 firstfnum = lsn.file;
1688 if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0)
1689 goto err;
1690 if ((ret = __log_valid(dblp, firstfnum, 0, NULL, 0,
1691 NULL, &oldver)) != 0)
1692 goto err;
1693 /*
1694 * If the first and last LSN are in the same file, then we
1695 * already have the version in oldver. Return it.
1696 */
1697 if (firstfnum == lsn.file)
1698 goto err;
1699
1700 /*
1701 * Otherwise they're in different files and we call __log_valid
1702 * to get the version numbers in both files.
1703 */
1704 if ((ret = __log_valid(dblp, lsn.file, 0, NULL, 0,
1705 NULL, &lastver)) != 0)
1706 goto err;
1707 /*
1708 * If the version numbers are different, walk backward getting
1709 * the version of each log file until we find one that is
1710 * different than the last.
1711 */
1712 if (oldver != lastver) {
1713 for (fnum = lsn.file - 1; fnum >= firstfnum; fnum--) {
1714 if ((ret = __log_valid(dblp, fnum, 0, NULL, 0,
1715 NULL, &oldver)) != 0)
1716 goto err;
1717 if (oldver != lastver)
1718 break;
1719 }
1720 }
1721 err: if (logc != NULL && ((t_ret = __logc_close(logc)) != 0) && ret == 0)
1722 ret = t_ret;
1723 if (ret == 0 && ver != NULL)
1724 *ver = oldver;
1725 return (ret);
1726 }
1727