1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 2013 Oracle and/or its affiliates.  All rights reserved.
5  *
6  * $Id$
7  */
8 
9 #include "db_config.h"
10 
11 #include "db_int.h"
12 #include "dbinc/crypto.h"
13 #include "dbinc/hmac.h"
14 #include "dbinc/log.h"
15 #include "dbinc/txn.h"
16 #include "dbinc/db_page.h"
17 #include "dbinc/db_am.h"
18 
19 static int	__log_init __P((ENV *, DB_LOG *));
20 static int	__log_recover __P((DB_LOG *));
21 
22 /*
23  * __log_open --
24  *	Internal version of log_open: only called from ENV->open.
25  *
26  * PUBLIC: int __log_open __P((ENV *));
27  */
28 int
__log_open(env)29 __log_open(env)
30 	ENV *env;
31 {
32 	DB_ENV *dbenv;
33 	DB_LOG *dblp;
34 	LOG *lp;
35 	u_int8_t *bulk;
36 	int region_locked, ret;
37 
38 	dbenv = env->dbenv;
39 	region_locked = 0;
40 
41 	/* Create/initialize the DB_LOG structure. */
42 	if ((ret = __os_calloc(env, 1, sizeof(DB_LOG), &dblp)) != 0)
43 		return (ret);
44 	dblp->env = env;
45 
46 	/* Join/create the log region. */
47 	if ((ret = __env_region_share(env, &dblp->reginfo)) != 0)
48 		goto err;
49 
50 	/* If we created the region, initialize it. */
51 	if (F_ISSET(&dblp->reginfo, REGION_CREATE))
52 		if ((ret = __log_init(env, dblp)) != 0)
53 			goto err;
54 
55 	/* Set the local addresses. */
56 	lp = dblp->reginfo.primary = R_ADDR(&dblp->reginfo,
57 	    ((REGENV *)env->reginfo->primary)->lg_primary);
58 	dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);
59 
60 	/*
61 	 * If the region is threaded, we have to lock the DBREG list, and we
62 	 * need to allocate a mutex for that purpose.
63 	 */
64 	if ((ret = __mutex_alloc(env,
65 	    MTX_LOG_REGION, DB_MUTEX_PROCESS_ONLY, &dblp->mtx_dbreg)) != 0)
66 		goto err;
67 
68 	/*
69 	 * Set the handle -- we may be about to run recovery, which allocates
70 	 * log cursors.  Log cursors require logging be already configured,
71 	 * and the handle being set is what demonstrates that.
72 	 *
73 	 * If we created the region, run recovery.  If that fails, make sure
74 	 * we reset the log handle before cleaning up, otherwise we will try
75 	 * and clean up again in the mainline ENV initialization code.
76 	 */
77 	env->lg_handle = dblp;
78 
79 	if (F_ISSET(&dblp->reginfo, REGION_CREATE)) {
80 		/*
81 		 * We first take the log file size from the environment, if
82 		 * specified.  If that wasn't set, default it.  Regardless,
83 		 * recovery may set it from the persistent information in a
84 		 * log file header.
85 		 */
86 		if (lp->log_size == 0)
87 			lp->log_size =
88 			    FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
89 			    LG_MAX_INMEM : LG_MAX_DEFAULT;
90 
91 		if ((ret = __log_recover(dblp)) != 0)
92 			goto err;
93 
94 		/*
95 		 * If the next log file size hasn't been set yet, default it
96 		 * to the current log file size.
97 		 */
98 		if (lp->log_nsize == 0)
99 			lp->log_nsize = lp->log_size;
100 
101 		/*
102 		 * If we haven't written any log files, write the first one
103 		 * so that checkpoint gets a valid ckp_lsn value.
104 		 */
105 		if (IS_INIT_LSN(lp->lsn) &&
106 		    (ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
107 			goto err;
108 
109 		/*
110 		 * Initialize replication's next-expected LSN value
111 		 * and replication's bulk buffer.  In __env_open, we
112 		 * always create/open the replication region before
113 		 * the log region so we're assured that our rep_handle
114 		 * is valid at this point, if replication is being used.
115 		 */
116 		lp->ready_lsn = lp->lsn;
117 		if (IS_ENV_REPLICATED(env)) {
118 			if ((ret =
119 			    __env_alloc(&dblp->reginfo, MEGABYTE, &bulk)) != 0)
120 				goto err;
121 			lp->bulk_buf = R_OFFSET(&dblp->reginfo, bulk);
122 			lp->bulk_len = MEGABYTE;
123 			lp->bulk_off = 0;
124 			lp->wait_ts = env->rep_handle->request_gap;
125 			__os_gettime(env, &lp->rcvd_ts, 1);
126 		} else {
127 			lp->bulk_buf = INVALID_ROFF;
128 			lp->bulk_len = 0;
129 			lp->bulk_off = 0;
130 		}
131 	} else {
132 		/*
133 		 * A process joining the region may have reset the log file
134 		 * size, too.  If so, it only affects the next log file we
135 		 * create.  We need to check that the size is reasonable given
136 		 * the buffer size in the region.
137 		 */
138 		LOG_SYSTEM_LOCK(env);
139 		region_locked = 1;
140 
141 		 if (dbenv->lg_size != 0) {
142 			if ((ret =
143 			    __log_check_sizes(env, dbenv->lg_size, 0)) != 0)
144 				goto err;
145 
146 			lp->log_nsize = dbenv->lg_size;
147 		 }
148 
149 		LOG_SYSTEM_UNLOCK(env);
150 		region_locked = 0;
151 
152 		if (dbenv->lg_flags != 0 && (ret =
153 		    __log_set_config_int(dbenv, dbenv->lg_flags, 1, 0)) != 0)
154 			return (ret);
155 	}
156 	dblp->reginfo.mtx_alloc = lp->mtx_region;
157 
158 	return (0);
159 
160 err:	if (dblp->reginfo.addr != NULL) {
161 		if (region_locked)
162 			LOG_SYSTEM_UNLOCK(env);
163 		(void)__env_region_detach(env, &dblp->reginfo, 0);
164 	}
165 	env->lg_handle = NULL;
166 
167 	(void)__mutex_free(env, &dblp->mtx_dbreg);
168 	__os_free(env, dblp);
169 
170 	return (ret);
171 }
172 
173 /*
174  * __log_init --
175  *	Initialize a log region in shared memory.
176  */
177 static int
__log_init(env,dblp)178 __log_init(env, dblp)
179 	ENV *env;
180 	DB_LOG *dblp;
181 {
182 	DB_ENV *dbenv;
183 	LOG *lp;
184 	int ret;
185 	void *p;
186 
187 	dbenv = env->dbenv;
188 
189 	/*
190 	 * This is the first point where we can validate the buffer size,
191 	 * because we know all three settings have been configured (file size,
192 	 * buffer size and the in-memory flag).
193 	 */
194 	if ((ret =
195 	   __log_check_sizes(env, dbenv->lg_size, dbenv->lg_bsize)) != 0)
196 		return (ret);
197 
198 	if ((ret = __env_alloc(&dblp->reginfo,
199 	    sizeof(*lp), &dblp->reginfo.primary)) != 0)
200 		goto mem_err;
201 
202 	((REGENV *)env->reginfo->primary)->lg_primary =
203 	     R_OFFSET(&dblp->reginfo, dblp->reginfo.primary);
204 
205 	lp = dblp->reginfo.primary;
206 	memset(lp, 0, sizeof(*lp));
207 
208 	/* We share the region so we need the same mutex. */
209 	lp->mtx_region = ((REGENV *)env->reginfo->primary)->mtx_regenv;
210 
211 	lp->fid_max = 0;
212 	SH_TAILQ_INIT(&lp->fq);
213 	lp->free_fid_stack = INVALID_ROFF;
214 	lp->free_fids = lp->free_fids_alloced = 0;
215 
216 	/* Initialize LOG LSNs. */
217 	INIT_LSN(lp->lsn);
218 	INIT_LSN(lp->t_lsn);
219 
220 	/*
221 	 * It's possible to be waiting for an LSN of [1][0], if a replication
222 	 * client gets the first log record out of order.  An LSN of [0][0]
223 	 * signifies that we're not waiting.
224 	 */
225 	ZERO_LSN(lp->waiting_lsn);
226 
227 	/*
228 	 * Log makes note of the fact that it ran into a checkpoint on
229 	 * startup if it did so, as a recovery optimization.  A zero
230 	 * LSN signifies that it hasn't found one [yet].
231 	 */
232 	ZERO_LSN(lp->cached_ckp_lsn);
233 
234 	if ((ret =
235 	    __mutex_alloc(env, MTX_LOG_FILENAME, 0, &lp->mtx_filelist)) != 0)
236 		return (ret);
237 	if ((ret = __mutex_alloc(env, MTX_LOG_FLUSH, 0, &lp->mtx_flush)) != 0)
238 		return (ret);
239 
240 	/* Initialize the buffer. */
241 	if ((ret = __env_alloc(&dblp->reginfo, dbenv->lg_bsize, &p)) != 0) {
242 mem_err:	__db_errx( env, DB_STR("2524",
243 		    "unable to allocate log region memory"));
244 		return (ret);
245 	}
246 	lp->regionmax = dbenv->lg_regionmax;
247 	lp->buffer_off = R_OFFSET(&dblp->reginfo, p);
248 	lp->buffer_size = dbenv->lg_bsize;
249 	lp->filemode = dbenv->lg_filemode;
250 	lp->log_size = lp->log_nsize = dbenv->lg_size;
251 	lp->stat.st_fileid_init = dbenv->lg_fileid_init;
252 
253 	/* Initialize the commit Queue. */
254 	SH_TAILQ_INIT(&lp->free_commits);
255 	SH_TAILQ_INIT(&lp->commits);
256 	lp->ncommit = 0;
257 
258 	/* Initialize the logfiles list for in-memory logs. */
259 	SH_TAILQ_INIT(&lp->logfiles);
260 	SH_TAILQ_INIT(&lp->free_logfiles);
261 
262 	/*
263 	 * Fill in the log's persistent header.  Don't fill in the log file
264 	 * sizes, as they may change at any time and so have to be filled in
265 	 * as each log file is created.
266 	 */
267 	lp->persist.magic = DB_LOGMAGIC;
268 	/*
269 	 * Don't use __log_set_version because env->dblp isn't set up yet.
270 	 */
271 	lp->persist.version = DB_LOGVERSION;
272 	lp->persist.notused = 0;
273 	env->lg_handle = dblp;
274 
275 	/* Migrate persistent flags from the ENV into the region. */
276 	if (dbenv->lg_flags != 0 &&
277 	    (ret = __log_set_config_int(dbenv, dbenv->lg_flags, 1, 1)) != 0)
278 		return (ret);
279 
280 	(void)time(&lp->timestamp);
281 	return (0);
282 }
283 
284 /*
285  * __log_recover --
286  *	Recover a log.
287  */
288 static int
__log_recover(dblp)289 __log_recover(dblp)
290 	DB_LOG *dblp;
291 {
292 	DBT dbt;
293 	DB_ENV *dbenv;
294 	DB_LOGC *logc;
295 	DB_LSN lsn;
296 	ENV *env;
297 	LOG *lp;
298 	u_int32_t cnt, rectype;
299 	int ret;
300 	logfile_validity status;
301 
302 	env = dblp->env;
303 	dbenv = env->dbenv;
304 	logc = NULL;
305 	lp = dblp->reginfo.primary;
306 
307 	/*
308 	 * Find a log file.  If none exist, we simply return, leaving
309 	 * everything initialized to a new log.
310 	 */
311 	if ((ret = __log_find(dblp, 0, &cnt, &status)) != 0)
312 		return (ret);
313 	if (cnt == 0) {
314 		if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
315 			__db_msg(env, DB_STR("2525", "No log files found"));
316 		return (0);
317 	}
318 
319 	/*
320 	 * If the last file is an old, unreadable version, start a new
321 	 * file.  Don't bother finding the end of the last log file;
322 	 * we assume that it's valid in its entirety, since the user
323 	 * should have shut down cleanly or run recovery before upgrading.
324 	 */
325 	if (status == DB_LV_OLD_UNREADABLE) {
326 		lp->lsn.file = lp->s_lsn.file = cnt + 1;
327 		lp->lsn.offset = lp->s_lsn.offset = 0;
328 		goto skipsearch;
329 	}
330 	DB_ASSERT(env,
331 	    (status == DB_LV_NORMAL || status == DB_LV_OLD_READABLE));
332 
333 	/*
334 	 * We have the last useful log file and we've loaded any persistent
335 	 * information.  Set the end point of the log past the end of the last
336 	 * file. Read the last file, looking for the last checkpoint and
337 	 * the log's end.
338 	 */
339 	lp->lsn.file = cnt + 1;
340 	lp->lsn.offset = 0;
341 	lsn.file = cnt;
342 	lsn.offset = 0;
343 
344 	/*
345 	 * Allocate a cursor and set it to the first record.  This shouldn't
346 	 * fail, leave error messages on.
347 	 */
348 	if ((ret = __log_cursor(env, &logc)) != 0)
349 		return (ret);
350 	F_SET(logc, DB_LOG_LOCKED);
351 	memset(&dbt, 0, sizeof(dbt));
352 	if ((ret = __logc_get(logc, &lsn, &dbt, DB_SET)) != 0)
353 		goto err;
354 
355 	/*
356 	 * Read to the end of the file.  This may fail at some point, so
357 	 * turn off error messages.
358 	 */
359 	F_SET(logc, DB_LOG_SILENT_ERR);
360 	while (__logc_get(logc, &lsn, &dbt, DB_NEXT) == 0) {
361 		if (dbt.size < sizeof(u_int32_t))
362 			continue;
363 		LOGCOPY_32(env, &rectype, dbt.data);
364 		if (rectype == DB___txn_ckp)
365 			/*
366 			 * If we happen to run into a checkpoint, cache its
367 			 * LSN so that the transaction system doesn't have
368 			 * to walk this log file again looking for it.
369 			 */
370 			lp->cached_ckp_lsn = lsn;
371 	}
372 	F_CLR(logc, DB_LOG_SILENT_ERR);
373 
374 	/*
375 	 * We now know where the end of the log is.  Set the first LSN that
376 	 * we want to return to an application and the LSN of the last known
377 	 * record on disk.
378 	 */
379 	lp->lsn = lsn;
380 	lp->s_lsn = lsn;
381 	lp->lsn.offset += logc->len;
382 	lp->s_lsn.offset += logc->len;
383 
384 	/* Set up the current buffer information, too. */
385 	lp->len = logc->len;
386 	lp->a_off = 0;
387 	lp->b_off = 0;
388 	lp->w_off = lp->lsn.offset;
389 
390 skipsearch:
391 	if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
392 		__db_msg(env, DB_STR_A("2526",
393 		    "Finding last valid log LSN: file: %lu offset %lu",
394 		    "%lu %lu"), (u_long)lp->lsn.file, (u_long)lp->lsn.offset);
395 
396 err:	if (logc != NULL)
397 		(void)__logc_close(logc);
398 
399 	return (ret);
400 }
401 
402 /*
403  * __log_find --
404  *	Try to find a log file.  If find_first is set, valp will contain
405  * the number of the first readable log file, else it will contain the number
406  * of the last log file (which may be too old to read).
407  *
408  * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
409  */
410 int
__log_find(dblp,find_first,valp,statusp)411 __log_find(dblp, find_first, valp, statusp)
412 	DB_LOG *dblp;
413 	int find_first;
414 	u_int32_t *valp;
415 	logfile_validity *statusp;
416 {
417 	ENV *env;
418 	LOG *lp;
419 	logfile_validity logval_status, status;
420 	struct __db_filestart *filestart;
421 	u_int32_t clv, logval;
422 	int cnt, fcnt, ret;
423 	const char *dir;
424 	char *c, **names, *p, *q;
425 
426 	env = dblp->env;
427 	lp = dblp->reginfo.primary;
428 	logval_status = status = DB_LV_NONEXISTENT;
429 
430 	/* Return a value of 0 as the log file number on failure. */
431 	*valp = 0;
432 
433 	if (lp->db_log_inmemory) {
434 		filestart = find_first ?
435 		    SH_TAILQ_FIRST(&lp->logfiles, __db_filestart) :
436 		    SH_TAILQ_LAST(&lp->logfiles, links, __db_filestart);
437 		if (filestart != NULL) {
438 			*valp = filestart->file;
439 			logval_status = DB_LV_NORMAL;
440 		}
441 		*statusp = logval_status;
442 		return (0);
443 	}
444 
445 	/* Find the directory name. */
446 	if ((ret = __log_name(dblp, 1, &p, NULL, 0)) != 0) {
447 		__os_free(env, p);
448 		return (ret);
449 	}
450 	if ((q = __db_rpath(p)) == NULL)
451 		dir = PATH_DOT;
452 	else {
453 		*q = '\0';
454 		dir = p;
455 	}
456 
457 	/* Get the list of file names. */
458 retry:	if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) {
459 		__db_err(env, ret, "%s", dir);
460 		__os_free(env, p);
461 		return (ret);
462 	}
463 
464 	/* Search for a valid log file name. */
465 	for (cnt = fcnt, clv = logval = 0; --cnt >= 0;) {
466 		if (!IS_LOG_FILE(names[cnt]))
467 			continue;
468 
469 		/*
470 		 * Names of the form log\.[0-9]* are reserved for DB.  Other
471 		 * names sharing LFPREFIX, such as "log.db", are legal.
472 		 */
473 		for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++)
474 			if (!isdigit((int)*c))
475 				break;
476 		if (*c != '\0')
477 			continue;
478 
479 		/*
480 		 * Use atol, not atoi; if an "int" is 16-bits, the largest
481 		 * log file name won't fit.
482 		 */
483 		clv = (u_int32_t)atol(names[cnt] + (sizeof(LFPREFIX) - 1));
484 
485 		/*
486 		 * If searching for the first log file, we want to return the
487 		 * oldest log file we can read, or, if no readable log files
488 		 * exist, the newest log file we can't read (the crossover
489 		 * point between the old and new versions of the log file).
490 		 *
491 		 * If we're searching for the last log file, we want to return
492 		 * the newest log file, period.
493 		 *
494 		 * Readable log files should never precede unreadable log
495 		 * files, that would mean the admin seriously screwed up.
496 		 */
497 		if (find_first) {
498 			if (logval != 0 &&
499 			    status != DB_LV_OLD_UNREADABLE && clv > logval)
500 				continue;
501 		} else
502 			if (logval != 0 && clv < logval)
503 				continue;
504 
505 		if ((ret = __log_valid(dblp, clv, 1, NULL, 0,
506 		    &status, NULL)) != 0) {
507 			/*
508 			 * If we have raced with removal of a log file since
509 			 * the call to __os_dirlist, it may no longer exist.
510 			 * In that case, just go on to the next one.  If we're
511 			 * at the end of the list, all of the log files we saw
512 			 * initially are gone and we need to get the list again.
513 			 */
514 			if (ret == ENOENT) {
515 				ret = 0;
516 				if (cnt == 0) {
517 					__os_dirfree(env, names, fcnt);
518 					goto retry;
519 				}
520 				continue;
521 			}
522 			__db_err(env, ret, DB_STR_A("2527",
523 			    "Invalid log file: %s", "%s"), names[cnt]);
524 			goto err;
525 		}
526 		switch (status) {
527 		case DB_LV_NONEXISTENT:
528 			/* __log_valid never returns DB_LV_NONEXISTENT. */
529 			DB_ASSERT(env, 0);
530 			break;
531 		case DB_LV_INCOMPLETE:
532 			/*
533 			 * The last log file may not have been initialized --
534 			 * it's possible to create a log file but not write
535 			 * anything to it.  If performing recovery (that is,
536 			 * if find_first isn't set), ignore the file, it's
537 			 * not interesting.  If we're searching for the first
538 			 * log record, return the file (assuming we don't find
539 			 * something better), as the "real" first log record
540 			 * is likely to be in the log buffer, and we want to
541 			 * set the file LSN for our return.
542 			 */
543 			if (find_first)
544 				goto found;
545 			break;
546 		case DB_LV_OLD_UNREADABLE:
547 			/*
548 			 * If we're searching for the first log file, then we
549 			 * only want this file if we don't yet have a file or
550 			 * already have an unreadable file and this one is
551 			 * newer than that one.  If we're searching for the
552 			 * last log file, we always want this file because we
553 			 * wouldn't be here if it wasn't newer than our current
554 			 * choice.
555 			 */
556 			if (!find_first || logval == 0 ||
557 			    (status == DB_LV_OLD_UNREADABLE && clv > logval))
558 				goto found;
559 			break;
560 		case DB_LV_NORMAL:
561 		case DB_LV_OLD_READABLE:
562 found:			logval = clv;
563 			logval_status = status;
564 			break;
565 		}
566 	}
567 
568 	*valp = logval;
569 
570 err:	__os_dirfree(env, names, fcnt);
571 	__os_free(env, p);
572 	*statusp = logval_status;
573 
574 	return (ret);
575 }
576 
577 /*
578  * log_valid --
579  *	Validate a log file.  Returns an error code in the event of
580  *	a fatal flaw in a the specified log file;  returns success with
581  *	a code indicating the currentness and completeness of the specified
582  *	log file if it is not unexpectedly flawed (that is, if it's perfectly
583  *	normal, if it's zero-length, or if it's an old version).
584  *
585  * PUBLIC: int __log_valid __P((DB_LOG *, u_int32_t, int,
586  * PUBLIC:     DB_FH **, u_int32_t, logfile_validity *, u_int32_t *));
587  */
588 int
__log_valid(dblp,number,set_persist,fhpp,flags,statusp,versionp)589 __log_valid(dblp, number, set_persist, fhpp, flags, statusp, versionp)
590 	DB_LOG *dblp;
591 	u_int32_t number;
592 	int set_persist;
593 	DB_FH **fhpp;
594 	u_int32_t flags;
595 	logfile_validity *statusp;
596 	u_int32_t *versionp;
597 {
598 	DB_CIPHER *db_cipher;
599 	DB_FH *fhp;
600 	ENV *env;
601 	HDR *hdr;
602 	LOG *lp;
603 	LOGP *persist;
604 	logfile_validity status;
605 	size_t hdrsize, nr, recsize;
606 	int chksum_includes_hdr, is_hmac, ret;
607 	u_int32_t logversion;
608 	u_int8_t *tmp;
609 	char *fname;
610 
611 	env = dblp->env;
612 	db_cipher = env->crypto_handle;
613 	fhp = NULL;
614 	persist = NULL;
615 	status = DB_LV_NORMAL;
616 	tmp = NULL;
617 #if defined(HAVE_LOG_CHECKSUM)
618 	/* Most log versions include the hdr in the checksum. */
619 	chksum_includes_hdr = 1;
620 #else
621 	COMPQUIET(chksum_includes_hdr, 0);
622 #endif
623 
624 	/* Return the file handle to our caller, on request */
625 	if (fhpp != NULL)
626 		*fhpp = NULL;
627 
628 	if (flags == 0)
629 		flags = DB_OSO_RDONLY | DB_OSO_SEQ;
630 	/* Try to open the log file. */
631 	if ((ret = __log_name(dblp, number, &fname, &fhp, flags)) != 0) {
632 		__os_free(env, fname);
633 		return (ret);
634 	}
635 
636 	hdrsize = HDR_NORMAL_SZ;
637 	is_hmac = 0;
638 	recsize = sizeof(LOGP);
639 	if (CRYPTO_ON(env)) {
640 		hdrsize = HDR_CRYPTO_SZ;
641 		recsize += db_cipher->adj_size(recsize);
642 		is_hmac = 1;
643 	}
644 	if ((ret = __os_calloc(env, 1, recsize + hdrsize, &tmp)) != 0)
645 		goto err;
646 
647 	hdr = (HDR *)tmp;
648 	persist = (LOGP *)(tmp + hdrsize);
649 
650 	/*
651 	 * Try to read the header.  This can fail if the log is truncated, or
652 	 * if we find a preallocated log file where the header has not yet been
653 	 * written, so we need to check whether the header is zero-filled.
654 	 */
655 	if ((ret = __os_read(env, fhp, tmp, recsize + hdrsize, &nr)) != 0 ||
656 	    nr != recsize + hdrsize ||
657 	    (hdr->len == 0 && persist->magic == 0 && persist->log_size == 0)) {
658 		if (ret == 0)
659 			status = DB_LV_INCOMPLETE;
660 		else
661 			/*
662 			 * The error was a fatal read error, not just an
663 			 * incompletely initialized log file.
664 			 */
665 			__db_err(env, ret, DB_STR_A("2528",
666 			    "ignoring log file: %s", "%s"), fname);
667 		goto err;
668 	}
669 
670 	if (LOG_SWAPPED(env))
671 		__log_hdrswap(hdr, CRYPTO_ON(env));
672 
673 	/*
674 	 * Now we have to validate the persistent record.  We have
675 	 * several scenarios we have to deal with:
676 	 *
677 	 * 1.  User has crypto turned on:
678 	 *	- They're reading an old, unencrypted log file
679 	 *	  .  We will fail the record size match check below.
680 	 *	- They're reading a current, unencrypted log file
681 	 *	  .  We will fail the record size match check below.
682 	 *	- They're reading an old, encrypted log file [NOT YET]
683 	 *	  .  After decryption we'll fail the version check.  [NOT YET]
684 	 *	- They're reading a current, encrypted log file
685 	 *	  .  We should proceed as usual.
686 	 * 2.  User has crypto turned off:
687 	 *	- They're reading an old, unencrypted log file
688 	 *	  .  We will fail the version check.
689 	 *	- They're reading a current, unencrypted log file
690 	 *	  .  We should proceed as usual.
691 	 *	- They're reading an old, encrypted log file [NOT YET]
692 	 *	  .  We'll fail the magic number check (it is encrypted).
693 	 *	- They're reading a current, encrypted log file
694 	 *	  .  We'll fail the magic number check (it is encrypted).
695 	 */
696 	if (CRYPTO_ON(env)) {
697 		/*
698 		 * If we are trying to decrypt an unencrypted log
699 		 * we can only detect that by having an unreasonable
700 		 * data length for our persistent data.
701 		 */
702 		if ((hdr->len - hdrsize) != recsize) {
703 			__db_errx(env, "log record size mismatch");
704 			goto err;
705 		}
706 		/*
707 		 * The checksum is calculated from the encrypted data, and,
708 		 * for recent logs, the fields hdr->{prev,len}.
709 		 */
710 #ifdef HAVE_LOG_CHECKSUM
711 		if ((ret = __db_check_chksum(env, hdr, db_cipher,
712 		    &hdr->chksum[0], (u_int8_t *)persist,
713 		    hdr->len - hdrsize, is_hmac)) != 0) {
714 			/*
715 			 * The checksum doesn't verify when the header fields
716 			 * are included; try without the header.
717 			 */
718 
719 			if ((ret = __db_check_chksum(env, NULL, db_cipher,
720 			    &hdr->chksum[0], (u_int8_t *)persist,
721 			    hdr->len - hdrsize, is_hmac)) != 0)
722 				goto bad_checksum;
723 			/*
724  			 * The checksum verifies without the header.  Make note
725  			 * of that, because it is only acceptable when the log
726  			 * version < DB_LOGCHKSUM.  Later, when we determine log
727  			 * version, we will confirm this.
728 			 */
729 			chksum_includes_hdr = 0;
730 		}
731 #endif
732 
733 		if ((ret = db_cipher->decrypt(env, db_cipher->data,
734 		    &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0)
735 			goto err;
736 	}
737 
738 	/* Swap the header, if necessary. */
739 	if (LOG_SWAPPED(env)) {
740 		/*
741 		 * If the magic number is not byte-swapped, we're looking at an
742 		 * old log that we can no longer read.
743 		 */
744 		if (persist->magic == DB_LOGMAGIC) {
745 			__db_errx(env, DB_STR_A("2529",
746 			    "Ignoring log file: %s historic byte order",
747 			    "%s"), fname);
748 			status = DB_LV_OLD_UNREADABLE;
749 			goto err;
750 		}
751 
752 		__log_persistswap(persist);
753 	}
754 
755 	/* Validate the header. */
756 	if (persist->magic != DB_LOGMAGIC) {
757 		__db_errx(env, DB_STR_A("2530",
758 		    "Ignoring log file: %s: magic number %lx, not %lx",
759 		    "%s %lx %lx"), fname,
760 		    (u_long)persist->magic, (u_long)DB_LOGMAGIC);
761 		ret = EINVAL;
762 		goto err;
763 	}
764 
765 	logversion = persist->version;
766 	/*
767 	 * Set our status code to indicate whether the log file belongs to an
768 	 * unreadable or readable old version; leave it alone if and only if
769 	 * the log file version is the current one.
770 	 */
771 	if (logversion > DB_LOGVERSION) {
772 		/* This is a fatal error--the log file is newer than DB. */
773 		__db_errx(env, DB_STR_A("2531",
774 		    "Unacceptable log file %s: unsupported log version %lu",
775 		    "%s %lu"), fname, (u_long)logversion);
776 		ret = EINVAL;
777 		goto err;
778 	} else if (logversion < DB_LOGOLDVER) {
779 		status = DB_LV_OLD_UNREADABLE;
780 		/* This is a non-fatal error, but give some feedback. */
781 		__db_errx(env, DB_STR_A("2532",
782 		    "Skipping log file %s: historic log version %lu", "%s %lu"),
783 		    fname, (u_long)logversion);
784 		/*
785 		 * We don't want to set persistent info based on an unreadable
786 		 * region, so jump to "err".
787 		 */
788 		goto err;
789 	} else if (logversion < DB_LOGVERSION)
790 		status = DB_LV_OLD_READABLE;
791 
792 	/*
793 	 * We could not check the checksum before checking the magic and version
794 	 * because old log headers put the length and checksum in a different
795 	 * location.
796 	 */
797 #ifdef HAVE_LOG_CHECKSUM
798 	if (CRYPTO_ON(env)) {
799 		/*
800 		 * We might have to declare a checksum failure here, if:
801 		 * - the checksum verified only by ignoring the header, and
802 		 * - the log version indicates that the header should have
803 		 * been included.
804 		 */
805 		if (!chksum_includes_hdr && logversion >= DB_LOGCHKSUM)
806 			goto bad_checksum;
807 	} else {
808 		/*
809 		 * The checksum was calculated with the swapped byte order. We
810 		 * might need to swap them back; the check needs the same bytes.
811 		 */
812 		if (LOG_SWAPPED(env))
813 			__log_persistswap(persist);
814 		/*
815 		 * We have the logversion here, so we know whether to include
816 		 * the hdr or not.
817 		 */
818 		if ((ret = __db_check_chksum(env,
819 		    logversion >= DB_LOGCHKSUM ? hdr : NULL, db_cipher,
820 		    &hdr->chksum[0], (u_int8_t *)persist,
821 		    hdr->len - hdrsize, is_hmac)) != 0) {
822 bad_checksum:
823 			__db_errx(env, DB_STR("2533",
824 			    "log record checksum mismatch"));
825 			goto err;
826 		}
827 
828 		if (LOG_SWAPPED(env))
829 			__log_persistswap(persist);
830 	}
831 #endif
832 
833 	/*
834 	 * If the log is readable so far and we're doing system initialization,
835 	 * set the region's persistent information based on the headers.
836 	 *
837 	 * Override the current log file size.
838 	 */
839 	if (set_persist) {
840 		lp = dblp->reginfo.primary;
841 		lp->log_size = persist->log_size;
842 		lp->persist.version = logversion;
843 	}
844 	if (versionp != NULL)
845 		*versionp = logversion;
846 
847 err:	if (fname != NULL)
848 		__os_free(env, fname);
849 	if (ret == 0 && fhpp != NULL)
850 		*fhpp = fhp;
851 	else
852 		/* Must close on error or if we only used it locally. */
853 		(void)__os_closehandle(env, fhp);
854 	if (tmp != NULL)
855 		__os_free(env, tmp);
856 
857 	if (statusp != NULL)
858 		*statusp = status;
859 
860 	return (ret);
861 }
862 
863 /*
864  * __log_env_refresh --
865  *	Clean up after the log system on a close or failed open.
866  *
867  * PUBLIC: int __log_env_refresh __P((ENV *));
868  */
869 int
__log_env_refresh(env)870 __log_env_refresh(env)
871 	ENV *env;
872 {
873 	DB_LOG *dblp;
874 	LOG *lp;
875 	REGINFO *reginfo;
876 	struct __fname *fnp;
877 	struct __db_commit *commit;
878 	struct __db_filestart *filestart;
879 	int ret, t_ret;
880 
881 	dblp = env->lg_handle;
882 	reginfo = &dblp->reginfo;
883 	lp = reginfo->primary;
884 	ret = 0;
885 
886 	/*
887 	 * Flush the log if it's private -- there's no Berkeley DB guarantee
888 	 * that this gets done, but in case the application has forgotten to
889 	 * flush for durability, it's the polite thing to do.
890 	 */
891 	if (F_ISSET(env, ENV_PRIVATE) &&
892 	    (t_ret = __log_flush(env, NULL)) != 0 && ret == 0)
893 		ret = t_ret;
894 
895 	if ((t_ret = __dbreg_close_files(env, 0)) != 0 && ret == 0)
896 		ret = t_ret;
897 
898 	/*
899 	 * After we close the files, check for any unlogged closes left in
900 	 * the shared memory queue.  If we find any, try to log it, otherwise
901 	 * return the error.  We cannot say the environment was closed
902 	 * cleanly.
903 	 */
904 	MUTEX_LOCK(env, lp->mtx_filelist);
905 	SH_TAILQ_FOREACH(fnp, &lp->fq, q, __fname)
906 		if (F_ISSET(fnp, DB_FNAME_NOTLOGGED) &&
907 		    (t_ret = __dbreg_close_id_int(
908 		    env, fnp, DBREG_CLOSE, 1)) != 0)
909 			ret = t_ret;
910 	MUTEX_UNLOCK(env, lp->mtx_filelist);
911 
912 	/*
913 	 * If a private region, return the memory to the heap.  Not needed for
914 	 * filesystem-backed or system shared memory regions, that memory isn't
915 	 * owned by any particular process.
916 	 */
917 	if (F_ISSET(env, ENV_PRIVATE)) {
918 		reginfo->mtx_alloc = MUTEX_INVALID;
919 		/* Discard the flush mutex. */
920 		if ((t_ret =
921 		    __mutex_free(env, &lp->mtx_flush)) != 0 && ret == 0)
922 			ret = t_ret;
923 
924 		/* Discard the buffer. */
925 		__env_alloc_free(reginfo, R_ADDR(reginfo, lp->buffer_off));
926 
927 		/* Discard stack of free file IDs. */
928 		if (lp->free_fid_stack != INVALID_ROFF)
929 			__env_alloc_free(reginfo,
930 			    R_ADDR(reginfo, lp->free_fid_stack));
931 
932 		/* Discard the list of in-memory log file markers. */
933 		while ((filestart = SH_TAILQ_FIRST(&lp->logfiles,
934 		    __db_filestart)) != NULL) {
935 			SH_TAILQ_REMOVE(&lp->logfiles, filestart, links,
936 			    __db_filestart);
937 			__env_alloc_free(reginfo, filestart);
938 		}
939 
940 		while ((filestart = SH_TAILQ_FIRST(&lp->free_logfiles,
941 		    __db_filestart)) != NULL) {
942 			SH_TAILQ_REMOVE(&lp->free_logfiles, filestart, links,
943 			    __db_filestart);
944 			__env_alloc_free(reginfo, filestart);
945 		}
946 
947 		/* Discard commit queue elements. */
948 		while ((commit = SH_TAILQ_FIRST(&lp->free_commits,
949 		    __db_commit)) != NULL) {
950 			SH_TAILQ_REMOVE(&lp->free_commits, commit, links,
951 			    __db_commit);
952 			__env_alloc_free(reginfo, commit);
953 		}
954 
955 		/* Discard replication bulk buffer. */
956 		if (lp->bulk_buf != INVALID_ROFF) {
957 			__env_alloc_free(reginfo,
958 			    R_ADDR(reginfo, lp->bulk_buf));
959 			lp->bulk_buf = INVALID_ROFF;
960 		}
961 	}
962 
963 	/* Discard the per-thread DBREG mutex. */
964 	if ((t_ret = __mutex_free(env, &dblp->mtx_dbreg)) != 0 && ret == 0)
965 		ret = t_ret;
966 
967 	/* Detach from the region. */
968 	if ((t_ret = __env_region_detach(env, reginfo, 0)) != 0 && ret == 0)
969 		ret = t_ret;
970 
971 	/* Close open files, release allocated memory. */
972 	if (dblp->lfhp != NULL) {
973 		if ((t_ret =
974 		    __os_closehandle(env, dblp->lfhp)) != 0 && ret == 0)
975 			ret = t_ret;
976 		dblp->lfhp = NULL;
977 	}
978 	if (dblp->dbentry != NULL)
979 		__os_free(env, dblp->dbentry);
980 
981 	__os_free(env, dblp);
982 
983 	env->lg_handle = NULL;
984 	return (ret);
985 }
986 
987 /*
988  * __log_get_cached_ckp_lsn --
989  *	Retrieve any last checkpoint LSN that we may have found on startup.
990  *
991  * PUBLIC: int __log_get_cached_ckp_lsn __P((ENV *, DB_LSN *));
992  */
993 int
__log_get_cached_ckp_lsn(env,ckp_lsnp)994 __log_get_cached_ckp_lsn(env, ckp_lsnp)
995 	ENV *env;
996 	DB_LSN *ckp_lsnp;
997 {
998 	DB_LOG *dblp;
999 	LOG *lp;
1000 
1001 	dblp = env->lg_handle;
1002 	lp = (LOG *)dblp->reginfo.primary;
1003 
1004 	LOG_SYSTEM_LOCK(env);
1005 	*ckp_lsnp = lp->cached_ckp_lsn;
1006 	LOG_SYSTEM_UNLOCK(env);
1007 
1008 	return (0);
1009 }
1010 
1011 /*
1012  * __log_region_mutex_count --
1013  *	Return the number of mutexes the log region will need.
1014  *
1015  * PUBLIC: u_int32_t __log_region_mutex_count __P((ENV *));
1016  */
1017 u_int32_t
__log_region_mutex_count(env)1018 __log_region_mutex_count(env)
1019 	ENV *env;
1020 {
1021 	/*
1022 	 * We need a few assorted mutexes, and one per transaction waiting
1023 	 * on the group commit list.  We can't know how many that will be,
1024 	 * but it should be bounded by the maximum active transactions.
1025 	 */
1026 	return (env->dbenv->tx_init + 5);
1027 }
1028 
1029 /*
1030  * __log_region_mutex_max --
1031  *	Return the number of additional mutexes the log region will need.
1032  *
1033  * PUBLIC: u_int32_t __log_region_mutex_max __P((ENV *));
1034  */
1035 u_int32_t
__log_region_mutex_max(env)1036 __log_region_mutex_max(env)
1037 	ENV *env;
1038 {
1039 	DB_ENV *dbenv;
1040 	u_int32_t count;
1041 
1042 	dbenv = env->dbenv;
1043 
1044 	if ((count = dbenv->tx_max) == 0)
1045 		count = DEF_MAX_TXNS;
1046 	if (count < dbenv->tx_init)
1047 		return (0);
1048 	return (count - dbenv->tx_init);
1049 }
1050 
1051 /*
1052  * __log_region_size --
1053  *	Return the amount of space needed for the log region.
1054  *	Make the region large enough to hold txn_max transaction
1055  *	detail structures  plus some space to hold thread handles
1056  *	and the beginning of the alloc region and anything we
1057  *	need for mutex system resource recording.
1058  * PUBLIC: size_t	__log_region_size __P((ENV *));
1059  */
1060 size_t
__log_region_size(env)1061 __log_region_size(env)
1062 	ENV *env;
1063 {
1064 	DB_ENV *dbenv;
1065 	size_t s;
1066 
1067 	dbenv = env->dbenv;
1068 
1069 	/* Set the default buffer size, if not otherwise configured. */
1070 	if (dbenv->lg_bsize == 0)
1071 		dbenv->lg_bsize = FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) ?
1072 		    LG_BSIZE_INMEM : LG_BSIZE_DEFAULT;
1073 
1074 	s = dbenv->lg_bsize;
1075 	/* Allocate the initial fileid allocation, plus some path name space. */
1076 	s += dbenv->lg_fileid_init * __env_alloc_size((sizeof(FNAME)) + 16);
1077 
1078 	return (s);
1079 }
1080 /*
1081  * __log_region_max --
1082  *	Return the amount of extra memory to allocate for logging informaition.
1083  * PUBLIC: size_t	__log_region_max __P((ENV *));
1084  */
1085 size_t
__log_region_max(env)1086 __log_region_max(env)
1087 	ENV *env;
1088 {
1089 
1090 	DB_ENV *dbenv;
1091 	size_t s;
1092 
1093 	dbenv = env->dbenv;
1094 	if (dbenv->lg_fileid_init == 0) {
1095 		if ((s = dbenv->lg_regionmax) == 0)
1096 			s = LG_BASE_REGION_SIZE;
1097 	} else if ((s = dbenv->lg_regionmax) != 0 &&
1098 	     s < dbenv->lg_fileid_init * (__env_alloc_size(sizeof(FNAME)) + 16))
1099 		s = 0;
1100 	else if (s != 0)
1101 		s -= dbenv->lg_fileid_init *
1102 		     (__env_alloc_size(sizeof(FNAME)) + 16);
1103 
1104 	return (s);
1105 }
1106 
1107 /*
1108  * __log_vtruncate
1109  *	This is a virtual truncate.  We set up the log indicators to
1110  * make everyone believe that the given record is the last one in the
1111  * log.  Returns with the next valid LSN (i.e., the LSN of the next
1112  * record to be written). This is used in replication to discard records
1113  * in the log file that do not agree with the master.
1114  *
1115  * PUBLIC: int __log_vtruncate __P((ENV *, DB_LSN *, DB_LSN *, DB_LSN *));
1116  */
1117 int
__log_vtruncate(env,lsn,ckplsn,trunclsn)1118 __log_vtruncate(env, lsn, ckplsn, trunclsn)
1119 	ENV *env;
1120 	DB_LSN *lsn, *ckplsn, *trunclsn;
1121 {
1122 	DBT log_dbt;
1123 	DB_LOG *dblp;
1124 	DB_LOGC *logc;
1125 	LOG *lp;
1126 	u_int32_t bytes, len;
1127 	size_t offset;
1128 	int ret, t_ret;
1129 
1130 	/* Need to find out the length of this soon-to-be-last record. */
1131 	if ((ret = __log_cursor(env, &logc)) != 0)
1132 		return (ret);
1133 	memset(&log_dbt, 0, sizeof(log_dbt));
1134 	ret = __logc_get(logc, lsn, &log_dbt, DB_SET);
1135 	len = logc->len;
1136 	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
1137 		ret = t_ret;
1138 	if (ret != 0)
1139 		return (ret);
1140 
1141 	/* Now do the truncate. */
1142 	dblp = env->lg_handle;
1143 	lp = (LOG *)dblp->reginfo.primary;
1144 
1145 	LOG_SYSTEM_LOCK(env);
1146 
1147 	/*
1148 	 * Flush the log so we can simply initialize the in-memory buffer
1149 	 * after the truncate.
1150 	 */
1151 	if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
1152 		goto err;
1153 
1154 	lp->lsn = *lsn;
1155 	lp->len = len;
1156 	lp->lsn.offset += lp->len;
1157 
1158 	offset = lp->b_off;
1159 	if (lp->db_log_inmemory && (ret =
1160 	    __log_inmem_lsnoff(dblp, &lp->lsn, &offset)) != 0) {
1161 			lp->b_off = (db_size_t)offset;
1162 			goto err;
1163 	}
1164 	lp->b_off = (db_size_t)offset;
1165 
1166 	/*
1167 	 * I am going to assume that the number of bytes written since
1168 	 * the last checkpoint doesn't exceed a 32-bit number.
1169 	 */
1170 	DB_ASSERT(env, lp->lsn.file >= ckplsn->file);
1171 	bytes = 0;
1172 	if (ckplsn->file != lp->lsn.file) {
1173 		bytes = lp->log_size - ckplsn->offset;
1174 		if (lp->lsn.file > ckplsn->file + 1)
1175 			bytes += lp->log_size *
1176 			    ((lp->lsn.file - ckplsn->file) - 1);
1177 		bytes += lp->lsn.offset;
1178 	} else
1179 		bytes = lp->lsn.offset - ckplsn->offset;
1180 
1181 	lp->stat.st_wc_mbytes += bytes / MEGABYTE;
1182 	lp->stat.st_wc_bytes += bytes % MEGABYTE;
1183 
1184 	/*
1185 	 * If the synced lsn is greater than our new end of log, reset it
1186 	 * to our current end of log.
1187 	 */
1188 	MUTEX_LOCK(env, lp->mtx_flush);
1189 	if (LOG_COMPARE(&lp->s_lsn, lsn) > 0)
1190 		lp->s_lsn = lp->lsn;
1191 	MUTEX_UNLOCK(env, lp->mtx_flush);
1192 
1193 	/* Initialize the in-region buffer to a pristine state. */
1194 	ZERO_LSN(lp->f_lsn);
1195 	lp->w_off = lp->lsn.offset;
1196 
1197 	if (trunclsn != NULL)
1198 		*trunclsn = lp->lsn;
1199 
1200 	/* Truncate the log to the new point. */
1201 	if ((ret = __log_zero(env, &lp->lsn)) != 0)
1202 		goto err;
1203 
1204 err:	LOG_SYSTEM_UNLOCK(env);
1205 	return (ret);
1206 }
1207 
1208 /*
1209  * __log_is_outdated --
1210  *	Used by the replication system to identify if a client's logs are too
1211  *	old.
1212  *
1213  * PUBLIC: int __log_is_outdated __P((ENV *, u_int32_t, int *));
1214  */
1215 int
__log_is_outdated(env,fnum,outdatedp)1216 __log_is_outdated(env, fnum, outdatedp)
1217 	ENV *env;
1218 	u_int32_t fnum;
1219 	int *outdatedp;
1220 {
1221 	DB_LOG *dblp;
1222 	LOG *lp;
1223 	char *name;
1224 	int ret;
1225 	u_int32_t cfile;
1226 	struct __db_filestart *filestart;
1227 
1228 	dblp = env->lg_handle;
1229 
1230 	/*
1231 	 * The log represented by env is compared to the file number passed
1232 	 * in fnum.  If the log file fnum does not exist and is lower-numbered
1233 	 * than the current logs, return *outdatedp non-zero, else we return 0.
1234 	 */
1235 	if (FLD_ISSET(env->dbenv->lg_flags, DB_LOG_IN_MEMORY)) {
1236 		LOG_SYSTEM_LOCK(env);
1237 		lp = (LOG *)dblp->reginfo.primary;
1238 		filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1239 		*outdatedp = filestart == NULL ? 0 : (fnum < filestart->file);
1240 		LOG_SYSTEM_UNLOCK(env);
1241 		return (0);
1242 	}
1243 
1244 	*outdatedp = 0;
1245 	if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) {
1246 		__os_free(env, name);
1247 		return (ret);
1248 	}
1249 
1250 	/* If the file exists, we're just fine. */
1251 	if (__os_exists(env, name, NULL) == 0)
1252 		goto out;
1253 
1254 	/*
1255 	 * It didn't exist, decide if the file number is too big or
1256 	 * too little.  If it's too little, then we need to indicate
1257 	 * that the LSN is outdated.
1258 	 */
1259 	LOG_SYSTEM_LOCK(env);
1260 	lp = (LOG *)dblp->reginfo.primary;
1261 	cfile = lp->lsn.file;
1262 	LOG_SYSTEM_UNLOCK(env);
1263 
1264 	if (cfile > fnum)
1265 		*outdatedp = 1;
1266 out:	__os_free(env, name);
1267 	return (ret);
1268 }
1269 
1270 /*
1271  * __log_zero --
1272  *	Zero out the tail of a log after a truncate.
1273  *
1274  * PUBLIC: int __log_zero __P((ENV *, DB_LSN *));
1275  */
1276 int
__log_zero(env,from_lsn)1277 __log_zero(env, from_lsn)
1278 	ENV *env;
1279 	DB_LSN *from_lsn;
1280 {
1281 	DB_FH *fhp;
1282 	DB_LOG *dblp;
1283 	LOG *lp;
1284 	struct __db_filestart *filestart, *nextstart;
1285 	size_t nbytes, len, nw;
1286 	u_int32_t fn, mbytes, bytes;
1287 	u_int8_t buf[4096];
1288 	int ret;
1289 	char *fname;
1290 
1291 	dblp = env->lg_handle;
1292 	lp = (LOG *)dblp->reginfo.primary;
1293 	DB_ASSERT(env, LOG_COMPARE(from_lsn, &lp->lsn) <= 0);
1294 	if (LOG_COMPARE(from_lsn, &lp->lsn) > 0) {
1295 		__db_errx(env, DB_STR("2534",
1296 		    "Warning: truncating to point beyond end of log"));
1297 		return (0);
1298 	}
1299 
1300 	if (lp->db_log_inmemory) {
1301 		/*
1302 		 * Remove the files that are invalidated by this truncate.
1303 		 */
1304 		for (filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1305 		    filestart != NULL; filestart = nextstart) {
1306 			nextstart = SH_TAILQ_NEXT(filestart,
1307 			    links, __db_filestart);
1308 			if (filestart->file > from_lsn->file) {
1309 				SH_TAILQ_REMOVE(&lp->logfiles,
1310 				    filestart, links, __db_filestart);
1311 				SH_TAILQ_INSERT_HEAD(&lp->free_logfiles,
1312 				    filestart, links, __db_filestart);
1313 			}
1314 		}
1315 
1316 		return (0);
1317 	}
1318 
1319 	/* Close any open file handles so unlinks don't fail. */
1320 	if (dblp->lfhp != NULL) {
1321 		(void)__os_closehandle(env, dblp->lfhp);
1322 		dblp->lfhp = NULL;
1323 	}
1324 
1325 	/* Throw away any extra log files that we have around. */
1326 	for (fn = from_lsn->file + 1;; fn++) {
1327 		if (__log_name(dblp, fn, &fname, &fhp, DB_OSO_RDONLY) != 0) {
1328 			__os_free(env, fname);
1329 			break;
1330 		}
1331 		(void)__os_closehandle(env, fhp);
1332 		(void)time(&lp->timestamp);
1333 		ret = __os_unlink(env, fname, 0);
1334 		__os_free(env, fname);
1335 		if (ret != 0)
1336 			return (ret);
1337 	}
1338 
1339 	/* We removed some log files; have to 0 to end of file. */
1340 	if ((ret =
1341 	    __log_name(dblp, from_lsn->file, &fname, &dblp->lfhp, 0)) != 0) {
1342 		__os_free(env, fname);
1343 		return (ret);
1344 	}
1345 	__os_free(env, fname);
1346 	if ((ret = __os_ioinfo(env,
1347 	    NULL, dblp->lfhp, &mbytes, &bytes, NULL)) != 0)
1348 		goto err;
1349 	DB_ASSERT(env, (mbytes * MEGABYTE + bytes) >= from_lsn->offset);
1350 	len = (mbytes * MEGABYTE + bytes) - from_lsn->offset;
1351 
1352 	memset(buf, 0, sizeof(buf));
1353 
1354 	/* Initialize the write position. */
1355 	if ((ret = __os_seek(env, dblp->lfhp, 0, 0, from_lsn->offset)) != 0)
1356 		goto err;
1357 
1358 	while (len > 0) {
1359 		nbytes = len > sizeof(buf) ? sizeof(buf) : len;
1360 		if ((ret =
1361 		    __os_write(env, dblp->lfhp, buf, nbytes, &nw)) != 0)
1362 			goto err;
1363 		len -= nbytes;
1364 	}
1365 
1366 err:	(void)__os_closehandle(env, dblp->lfhp);
1367 	dblp->lfhp = NULL;
1368 
1369 	return (ret);
1370 }
1371 
1372 /*
1373  * __log_inmem_lsnoff --
1374  *	Find the offset in the buffer of a given LSN.
1375  *
1376  * PUBLIC: int __log_inmem_lsnoff __P((DB_LOG *, DB_LSN *, size_t *));
1377  */
1378 int
__log_inmem_lsnoff(dblp,lsnp,offsetp)1379 __log_inmem_lsnoff(dblp, lsnp, offsetp)
1380 	DB_LOG *dblp;
1381 	DB_LSN *lsnp;
1382 	size_t *offsetp;
1383 {
1384 	LOG *lp;
1385 	struct __db_filestart *filestart;
1386 
1387 	lp = (LOG *)dblp->reginfo.primary;
1388 
1389 	SH_TAILQ_FOREACH(filestart, &lp->logfiles, links, __db_filestart)
1390 		if (filestart->file == lsnp->file) {
1391 			*offsetp = (u_int32_t)
1392 			    (filestart->b_off + lsnp->offset) % lp->buffer_size;
1393 			return (0);
1394 		}
1395 
1396 	return (DB_NOTFOUND);
1397 }
1398 
1399 /*
1400  * __log_inmem_newfile --
1401  *	Records the offset of the beginning of a new file in the in-memory
1402  *	buffer.
1403  *
1404  * PUBLIC: int __log_inmem_newfile __P((DB_LOG *, u_int32_t));
1405  */
1406 int
__log_inmem_newfile(dblp,file)1407 __log_inmem_newfile(dblp, file)
1408 	DB_LOG *dblp;
1409 	u_int32_t file;
1410 {
1411 	HDR hdr;
1412 	LOG *lp;
1413 	struct __db_filestart *filestart;
1414 	int ret;
1415 #ifdef DIAGNOSTIC
1416 	struct __db_filestart *first, *last;
1417 #endif
1418 
1419 	lp = (LOG *)dblp->reginfo.primary;
1420 
1421 	/*
1422 	 * If the log buffer is empty, reuse the filestart entry.
1423 	 */
1424 	filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1425 	if (filestart != NULL &&
1426 	    RINGBUF_LEN(lp, filestart->b_off, lp->b_off) <=
1427 	    sizeof(HDR) + sizeof(LOGP)) {
1428 		filestart->file = file;
1429 		filestart->b_off = lp->b_off;
1430 		return (0);
1431 	}
1432 
1433 	/*
1434 	 * We write an empty header at the end of every in-memory log file.
1435 	 * This is used during cursor traversal to indicate when to switch the
1436 	 * LSN to the next file.
1437 	 */
1438 	if (file > 1) {
1439 		memset(&hdr, 0, sizeof(HDR));
1440 		__log_inmem_copyin(dblp, lp->b_off, &hdr, sizeof(HDR));
1441 		lp->b_off = (lp->b_off + sizeof(HDR)) % lp->buffer_size;
1442 	}
1443 
1444 	filestart = SH_TAILQ_FIRST(&lp->free_logfiles, __db_filestart);
1445 	if (filestart == NULL) {
1446 		if ((ret = __env_alloc(&dblp->reginfo,
1447 		    sizeof(struct __db_filestart), &filestart)) != 0)
1448 			return (ret);
1449 		memset(filestart, 0, sizeof(*filestart));
1450 	} else
1451 		SH_TAILQ_REMOVE(&lp->free_logfiles, filestart,
1452 		    links, __db_filestart);
1453 
1454 	filestart->file = file;
1455 	filestart->b_off = lp->b_off;
1456 
1457 #ifdef DIAGNOSTIC
1458 	first = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1459 	last = SH_TAILQ_LAST(&(lp)->logfiles, links, __db_filestart);
1460 
1461 	/* Check that we don't wrap. */
1462 	DB_ASSERT(dblp->env, !first || first == last ||
1463 	    RINGBUF_LEN(lp, first->b_off, lp->b_off) ==
1464 	    RINGBUF_LEN(lp, first->b_off, last->b_off) +
1465 	    RINGBUF_LEN(lp, last->b_off, lp->b_off));
1466 #endif
1467 
1468 	SH_TAILQ_INSERT_TAIL(&lp->logfiles, filestart, links);
1469 	return (0);
1470 }
1471 
1472 /*
1473  * __log_inmem_chkspace --
1474  *	Ensure that the requested amount of space is available in the buffer,
1475  *	and invalidate the region.
1476  *      Note: assumes that the region lock is held on entry.
1477  *
1478  * PUBLIC: int __log_inmem_chkspace __P((DB_LOG *, size_t));
1479  */
1480 int
__log_inmem_chkspace(dblp,len)1481 __log_inmem_chkspace(dblp, len)
1482 	DB_LOG *dblp;
1483 	size_t len;
1484 {
1485 	DB_LSN active_lsn, old_active_lsn;
1486 	ENV *env;
1487 	LOG *lp;
1488 	struct __db_filestart *filestart;
1489 	size_t offset;
1490 	int ret;
1491 
1492 	env = dblp->env;
1493 	lp = dblp->reginfo.primary;
1494 
1495 	DB_ASSERT(env, lp->db_log_inmemory);
1496 
1497 	/*
1498 	 * Allow room for an extra header so that we don't need to check for
1499 	 * space when switching files.
1500 	 */
1501 	len += sizeof(HDR);
1502 
1503 	/*
1504 	 * If transactions are enabled and we're about to fill available space,
1505 	 * update the active LSN and recheck.  If transactions aren't enabled,
1506 	 * don't even bother checking: in that case we can always overwrite old
1507 	 * log records, because we're never going to abort.
1508 	 */
1509 	while (TXN_ON(env) &&
1510 	    RINGBUF_LEN(lp, lp->b_off, lp->a_off) <= len) {
1511 		old_active_lsn = lp->active_lsn;
1512 		active_lsn = lp->lsn;
1513 
1514 		/*
1515 		 * Drop the log region lock so we don't hold it while
1516 		 * taking the transaction region lock.
1517 		 */
1518 		LOG_SYSTEM_UNLOCK(env);
1519 		ret = __txn_getactive(env, &active_lsn);
1520 		LOG_SYSTEM_LOCK(env);
1521 		if (ret != 0)
1522 			return (ret);
1523 		active_lsn.offset = 0;
1524 
1525 		/* If we didn't make any progress, give up. */
1526 		if (LOG_COMPARE(&active_lsn, &old_active_lsn) == 0) {
1527 			__db_errx(env, DB_STR("2535",
1528 "In-memory log buffer is full (an active transaction spans the buffer)"));
1529 			return (DB_LOG_BUFFER_FULL);
1530 		}
1531 
1532 		/* Make sure we're moving the region LSN forwards. */
1533 		if (LOG_COMPARE(&active_lsn, &lp->active_lsn) > 0) {
1534 			lp->active_lsn = active_lsn;
1535 			offset = lp->a_off;
1536 			(void)__log_inmem_lsnoff(dblp, &active_lsn, &offset);
1537 			lp->a_off = (db_size_t)offset;
1538 		}
1539 	}
1540 
1541 	/*
1542 	 * Remove the first file if it is invalidated by this write.
1543 	 * Log records can't be bigger than a file, so we only need to
1544 	 * check the first file.
1545 	 */
1546 	filestart = SH_TAILQ_FIRST(&lp->logfiles, __db_filestart);
1547 	if (filestart != NULL &&
1548 	    RINGBUF_LEN(lp, lp->b_off, filestart->b_off) <= len) {
1549 		SH_TAILQ_REMOVE(&lp->logfiles, filestart,
1550 		    links, __db_filestart);
1551 		SH_TAILQ_INSERT_HEAD(&lp->free_logfiles, filestart,
1552 		    links, __db_filestart);
1553 		lp->f_lsn.file = filestart->file + 1;
1554 	}
1555 
1556 	return (0);
1557 }
1558 
1559 /*
1560  * __log_inmem_copyout --
1561  *	Copies the given number of bytes from the buffer -- no checking.
1562  *      Note: assumes that the region lock is held on entry.
1563  *
1564  * PUBLIC: void __log_inmem_copyout __P((DB_LOG *, size_t, void *, size_t));
1565  */
1566 void
__log_inmem_copyout(dblp,offset,buf,size)1567 __log_inmem_copyout(dblp, offset, buf, size)
1568 	DB_LOG *dblp;
1569 	size_t offset;
1570 	void *buf;
1571 	size_t size;
1572 {
1573 	LOG *lp;
1574 	size_t nbytes;
1575 
1576 	lp = (LOG *)dblp->reginfo.primary;
1577 	nbytes = (offset + size < lp->buffer_size) ?
1578 	    size : lp->buffer_size - offset;
1579 	memcpy(buf, dblp->bufp + offset, nbytes);
1580 	if (nbytes < size)
1581 		memcpy((u_int8_t *)buf + nbytes, dblp->bufp, size - nbytes);
1582 }
1583 
1584 /*
1585  * __log_inmem_copyin --
1586  *	Copies the given number of bytes into the buffer -- no checking.
1587  *      Note: assumes that the region lock is held on entry.
1588  *
1589  * PUBLIC: void __log_inmem_copyin __P((DB_LOG *, size_t, void *, size_t));
1590  */
1591 void
__log_inmem_copyin(dblp,offset,buf,size)1592 __log_inmem_copyin(dblp, offset, buf, size)
1593 	DB_LOG *dblp;
1594 	size_t offset;
1595 	void *buf;
1596 	size_t size;
1597 {
1598 	LOG *lp;
1599 	size_t nbytes;
1600 
1601 	lp = (LOG *)dblp->reginfo.primary;
1602 	nbytes = (offset + size < lp->buffer_size) ?
1603 	    size : lp->buffer_size - offset;
1604 	memcpy(dblp->bufp + offset, buf, nbytes);
1605 	if (nbytes < size)
1606 		memcpy(dblp->bufp, (u_int8_t *)buf + nbytes, size - nbytes);
1607 }
1608 
1609 /*
1610  * __log_set_version --
1611  *	Sets the current version of the log subsystem to the given version.
1612  *	Essentially this modifies the lp->persist.version field in the
1613  *	shared memory region.  Called when region is initially created
1614  *	and when replication is starting up or finds a new master.
1615  *
1616  * PUBLIC: void __log_set_version __P((ENV *, u_int32_t));
1617  */
1618 void
__log_set_version(env,newver)1619 __log_set_version(env, newver)
1620 	ENV *env;
1621 	u_int32_t newver;
1622 {
1623 	DB_LOG *dblp;
1624 	LOG *lp;
1625 
1626 	dblp = env->lg_handle;
1627 	lp = (LOG *)dblp->reginfo.primary;
1628 	/*
1629 	 * We should be able to update this atomically without locking.
1630 	 */
1631 	lp->persist.version = newver;
1632 }
1633 
1634 /*
1635  * __log_get_oldversion --
1636  *	Returns the last version of log that this environment was working
1637  *	with.  Since there could be several versions of log files, if
1638  *	the user upgraded and didn't log archive, we check the version
1639  *	of the first log file, compare it to the last log file.  If those
1640  *	are different, then there is an older log existing, and we then
1641  *	walk backward in the log files looking for the version of the
1642  *	most recent older log file.
1643  *
1644  * PUBLIC: int __log_get_oldversion __P((ENV *, u_int32_t *));
1645  */
1646 int
__log_get_oldversion(env,ver)1647 __log_get_oldversion(env, ver)
1648 	ENV *env;
1649 	u_int32_t *ver;
1650 {
1651 	DBT rec;
1652 	DB_LOG *dblp;
1653 	DB_LOGC *logc;
1654 	DB_LSN lsn;
1655 	LOG *lp;
1656 	u_int32_t firstfnum, fnum, lastver, oldver;
1657 	int ret, t_ret;
1658 
1659 	dblp = env->lg_handle;
1660 	lp = dblp->reginfo.primary;
1661 
1662 	logc = NULL;
1663 	ret = 0;
1664 	oldver = DB_LOGVERSION;
1665 	/*
1666 	 * If we're in-memory logs we're always the current version.
1667 	 */
1668 	if (lp->db_log_inmemory) {
1669 		*ver = oldver;
1670 		return (0);
1671 	}
1672 	memset(&rec, 0, sizeof(rec));
1673 	if ((ret = __log_cursor(env, &logc)) != 0)
1674 		goto err;
1675 	/*
1676 	 * Get the version numbers of the first and last log files.
1677 	 */
1678 	if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
1679 		/*
1680 		 * If there is no log file, we'll get DB_NOTFOUND.
1681 		 * If we get that, set the version to the current.
1682 		 */
1683 		if (ret == DB_NOTFOUND)
1684 			ret = 0;
1685 		goto err;
1686 	}
1687 	firstfnum = lsn.file;
1688 	if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0)
1689 		goto err;
1690 	if ((ret = __log_valid(dblp, firstfnum, 0, NULL, 0,
1691 	    NULL, &oldver)) != 0)
1692 		goto err;
1693 	/*
1694 	 * If the first and last LSN are in the same file, then we
1695 	 * already have the version in oldver.  Return it.
1696 	 */
1697 	if (firstfnum == lsn.file)
1698 		goto err;
1699 
1700 	/*
1701 	 * Otherwise they're in different files and we call __log_valid
1702 	 * to get the version numbers in both files.
1703 	 */
1704 	if ((ret = __log_valid(dblp, lsn.file, 0, NULL, 0,
1705 	    NULL, &lastver)) != 0)
1706 		goto err;
1707 	/*
1708 	 * If the version numbers are different, walk backward getting
1709 	 * the version of each log file until we find one that is
1710 	 * different than the last.
1711 	 */
1712 	if (oldver != lastver) {
1713 		for (fnum = lsn.file - 1; fnum >= firstfnum; fnum--) {
1714 			if ((ret = __log_valid(dblp, fnum, 0, NULL, 0,
1715 			    NULL, &oldver)) != 0)
1716 				goto err;
1717 			if (oldver != lastver)
1718 				break;
1719 		}
1720 	}
1721 err:	if (logc != NULL && ((t_ret = __logc_close(logc)) != 0) && ret == 0)
1722 		ret = t_ret;
1723 	if (ret == 0 && ver != NULL)
1724 		*ver = oldver;
1725 	return (ret);
1726 }
1727