1 /*-
2  * Copyright (c) 1996, 2020 Oracle and/or its affiliates.  All rights reserved.
3  *
4  * See the file LICENSE for license information.
5  *
6  * $Id$
7  */
8 
9 #include "db_config.h"
10 
11 #include "db_int.h"
12 #include "dbinc/crypto.h"
13 #include "dbinc/hmac.h"
14 #include "dbinc/log.h"
15 #include "dbinc/txn.h"
16 #include "dbinc/db_page.h"
17 #include "dbinc_auto/db_ext.h"
18 
/*
 * Prototypes for the file-local (static) helper routines.
 */
static int __log_encrypt_record __P((ENV *, DBT *, HDR *, u_int32_t));
static int __log_file __P((ENV *, const DB_LSN *, char *, size_t));
static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
static int __log_flush_commit __P((ENV *, const DB_LSN *, u_int32_t));
static int __log_newfh __P((DB_LOG *, int));
static int __log_put_next __P((ENV *,
    DB_LSN *, const DBT *, HDR *, DB_LSN *));
static int __log_put_record_int __P((ENV *, DB *, DB_TXN *, DB_LSN *,
    u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, va_list));
static int __log_putr __P((DB_LOG *,
    DB_LSN *, const DBT *, u_int32_t, HDR *));
static int __log_write __P((DB_LOG *, void *, u_int32_t));
31 
32 /*
33  * __log_put_pp --
34  *	ENV->log_put pre/post processing.
35  *
36  * PUBLIC: int __log_put_pp __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
37  */
38 int
__log_put_pp(dbenv,lsnp,udbt,flags)39 __log_put_pp(dbenv, lsnp, udbt, flags)
40 	DB_ENV *dbenv;
41 	DB_LSN *lsnp;
42 	const DBT *udbt;
43 	u_int32_t flags;
44 {
45 	DB_THREAD_INFO *ip;
46 	ENV *env;
47 	int ret;
48 
49 	env = dbenv->env;
50 
51 	ENV_REQUIRES_CONFIG(env,
52 	    env->lg_handle, "DB_ENV->log_put", DB_INIT_LOG);
53 
54 	/* Validate arguments: check for allowed flags. */
55 	if ((ret = __db_fchk(env, "DB_ENV->log_put", flags,
56 	    DB_LOG_CHKPNT | DB_LOG_COMMIT |
57 	    DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0)
58 		return (ret);
59 
60 	/* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */
61 	if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH))
62 		return (__db_ferr(env, "DB_ENV->log_put", 1));
63 
64 	/* Replication clients should never write log records. */
65 	if (IS_REP_CLIENT(env)) {
66 		__db_errx(env, DB_STR("2511",
67 		    "DB_ENV->log_put is illegal on replication clients"));
68 		return (EINVAL);
69 	}
70 
71 	ENV_ENTER(env, ip);
72 	REPLICATION_WRAP(env, (__log_put(env, lsnp, udbt, flags)), 0, ret);
73 	ENV_LEAVE(env, ip);
74 	return (ret);
75 }
76 
/*
 * __log_put --
 *	ENV->log_put.
 *
 *	Append the record described by udbt to the log and return the LSN
 *	it was written at through lsnp.  The record is copied first unless
 *	DB_LOG_NOCOPY is set and we are not a replication master.  On a
 *	replication master the record is also sent to the clients, and a
 *	failure returned by __log_put_next or __log_flush_commit panics the
 *	environment.  If DB_FLUSH or DB_LOG_WRNOSYNC is set,
 *	__log_flush_commit is called before returning.
 *
 *	Returns 0 on success, non-zero on failure.
 *
 * PUBLIC: int __log_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
 */
int
__log_put(env, lsnp, udbt, flags)
	ENV *env;
	DB_LSN *lsnp;
	const DBT *udbt;
	u_int32_t flags;
{
	DBT *dbt, t;
	DB_CIPHER *db_cipher;
	DB_LOG *dblp;
	DB_LSN lsn, old_lsn;
	DB_REP *db_rep;
	HDR hdr;
	LOG *lp;
	REP *rep;
	int lock_held, need_free, ret;
	u_int8_t *key;

	COMPQUIET(rep, NULL);

	dblp = env->lg_handle;
	lp = dblp->reginfo.primary;
	db_cipher = env->crypto_handle;
	db_rep = env->rep_handle;
	if (db_rep != NULL)
		rep = db_rep->region;
	else
		rep = NULL;

	/*
	 * Work on a local copy of the caller's DBT so that the data pointer
	 * and size can be replaced without touching udbt.  old_lsn stays
	 * zero unless __log_put_next switches log files.
	 */
	dbt = &t;
	t = *udbt;
	lock_held = need_free = 0;
	ZERO_LSN(old_lsn);
	hdr.len = hdr.prev = 0;

	/*
	 * In general, if we are not a rep application, but are sharing a master
	 * rep env, we should not be writing log records.  However, we can allow
	 * a non-replication-aware process to join a pre-existing repmgr
	 * environment, if env handle meets repmgr's DB_THREAD requirement.
	 */

	if (IS_REP_MASTER(env) && db_rep->send == NULL) {
#ifdef HAVE_REPLICATION_THREADS
		if (F_ISSET(env, ENV_THREAD) && APP_IS_REPMGR(env)) {
			if ((ret = __repmgr_autostart(env)) != 0)
				return (ret);
		} else
#endif
		{
#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP)
			__db_errx(env, DB_STR("2512",
			    "Non-replication DB_ENV handle attempting "
			    "to modify a replicated environment"));
			return (EINVAL);
#endif
		}
	}

	if (IS_REP_CLIENT(env)) {
		__db_errx(env, DB_STR("2590",
			    "log_put is illegal on replication clients"));
#if  !defined(DIAGNOSTIC)
		/*
		 * DB_ASSERT would generate a stack if DIAGNOSTIC is true.
		 */
		__os_stack(env);
		return (__env_panic(env, EINVAL));
#endif

		DB_ASSERT(env, FALSE);
	}

	/*
	 * If we are coming from the logging code, we use an internal flag,
	 * DB_LOG_NOCOPY, because we know we can overwrite/encrypt the log
	 * record in place.  Otherwise, if a user called log_put then we
	 * must copy it to new memory so that we know we can write it.
	 *
	 * We also must copy it to new memory if we are a replication master
	 * so that we retain an unencrypted copy of the log record to send
	 * to clients.
	 */
	if (!LF_ISSET(DB_LOG_NOCOPY) || IS_REP_MASTER(env)) {
		/* With encryption on, leave room for the cipher's padding. */
		if (CRYPTO_ON(env))
			t.size += db_cipher->adj_size(udbt->size);
		if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
			goto err;
		need_free = 1;
		memcpy(t.data, udbt->data, udbt->size);
	}
	if ((ret = __log_encrypt_record(env, dbt, &hdr, udbt->size)) != 0)
		goto err;
	if (CRYPTO_ON(env))
		key = db_cipher->mac_key;
	else
		key = NULL;
#ifdef HAVE_LOG_CHECKSUM
	/* Compute the checksum before taking the log region lock. */
	__db_chksum(&hdr, dbt->data, dbt->size, key, hdr.chksum);
#endif

	LOG_SYSTEM_LOCK(env);
	lock_held = 1;

	if ((ret = __log_put_next(env, &lsn, dbt, &hdr, &old_lsn)) != 0)
		goto panic_check;

	/*
	 * Assign the return LSN before dropping the region lock.  Necessary
	 * in case the lsn is a begin_lsn from a TXN_DETAIL structure passed in
	 * by the logging routines.  We use atomic 32-bit operations because
	 * during commit this will be a TXN_DETAIL visible_lsn field, and MVCC
	 * relies on reading the fields atomically.
	 */
	lsnp->file = lsn.file;
	lsnp->offset = lsn.offset;

#ifdef HAVE_REPLICATION
	if (IS_REP_MASTER(env)) {
		__rep_newfile_args nf_args;
		DBT newfiledbt;
		REP_BULK bulk;
		size_t len;
		u_int32_t ctlflags;
		u_int8_t buf[__REP_NEWFILE_SIZE];

		/*
		 * Replication masters need to drop the lock to send messages,
		 * but want to drop and reacquire it a minimal number of times.
		 */
		ctlflags = LF_ISSET(DB_LOG_COMMIT | DB_LOG_CHKPNT) ?
		    REPCTL_PERM : 0;
		LOG_SYSTEM_UNLOCK(env);
		lock_held = 0;
		if (LF_ISSET(DB_FLUSH))
			ctlflags |= REPCTL_FLUSH;

		/*
		 * If we changed files and we're in a replicated environment,
		 * we need to inform our clients now that we've dropped the
		 * region lock.
		 *
		 * Note that a failed NEWFILE send is a dropped message that
		 * our client can handle, so we can ignore it.  It's possible
		 * that the record we already put is a commit, so we don't just
		 * want to return failure.
		 */
		if (!IS_ZERO_LSN(old_lsn)) {
			memset(&newfiledbt, 0, sizeof(newfiledbt));
			nf_args.version = lp->persist.version;
			(void)__rep_newfile_marshal(env, &nf_args,
			    buf, __REP_NEWFILE_SIZE, &len);
			DB_INIT_DBT(newfiledbt, buf, len);
			(void)__rep_send_message(env, DB_EID_BROADCAST,
			    REP_NEWFILE, &old_lsn, &newfiledbt, 0, 0);
		}

		/*
		 * If we're doing bulk processing put it in the bulk buffer.
		 */
		ret = 0;
		if (FLD_ISSET(rep->config, REP_C_BULK)) {
			/*
			 * Bulk could have been turned on by another process.
			 * If so, set the address into the bulk region now.
			 */
			if (db_rep->bulk == NULL)
				db_rep->bulk = R_ADDR(&dblp->reginfo,
				    lp->bulk_buf);
			memset(&bulk, 0, sizeof(bulk));
			bulk.addr = db_rep->bulk;
			bulk.offp = &lp->bulk_off;
			bulk.len = lp->bulk_len;
			bulk.lsn = lsn;
			bulk.type = REP_BULK_LOG;
			bulk.eid = DB_EID_BROADCAST;
			bulk.flagsp = &lp->bulk_flags;
			ret = __rep_bulk_message(env, &bulk, NULL,
			    &lsn, udbt, ctlflags);
		}
		/*
		 * Send the record as an individual message when bulk is off
		 * or when the bulk call returned DB_REP_BULKOVF.
		 */
		if (!FLD_ISSET(rep->config, REP_C_BULK) ||
		    ret == DB_REP_BULKOVF) {
			/*
			 * Then send the log record itself on to our clients.
			 */
			/*
			 * !!!
			 * In the crypto case, we MUST send the udbt, not the
			 * now-encrypted dbt.  Clients have no way to decrypt
			 * without the header.
			 */
			ret = __rep_send_message(env, DB_EID_BROADCAST,
			    REP_LOG, &lsn, udbt, ctlflags, 0);
		}
		if (FLD_ISSET(ctlflags, REPCTL_PERM)) {
			LOG_SYSTEM_LOCK(env);
#ifdef HAVE_STATISTICS
			if (IS_USING_LEASES(env))
				rep->stat.st_lease_sends++;
#endif
			/*
			 * Keep track of our last PERM lsn.  Set this on a
			 * master under the log lock.  When using leases, if
			 * we set max_perm_lsn too early (before the send)
			 * then we hit a lot of false invalid lease checks
			 * which all try to refresh and hurt performance.
			 */
			if (LOG_COMPARE(&lp->max_perm_lsn, &lsn) < 0)
				lp->max_perm_lsn = lsn;
			LOG_SYSTEM_UNLOCK(env);
		}
		/*
		 * If the send fails and we're a commit or checkpoint,
		 * there's nothing we can do;  the record's in the log.
		 * Flush it, even if we're running with TXN_NOSYNC,
		 * on the grounds that it should be in durable form somewhere.
		 */
		if (ret != 0 && FLD_ISSET(ctlflags, REPCTL_PERM))
			LF_SET(DB_FLUSH);
		/*
		 * We ignore send failures so reset 'ret' to 0 here.
		 * We needed to check special return values from
		 * bulk transfer and errors from either bulk or normal
		 * message sending need flushing on perm records.  But
		 * otherwise we need to ignore it and reset it now.
		 */
		ret = 0;
	}
#endif

	/*
	 * If needed, do a flush.  Note that failures at this point
	 * are only permissible if we know we haven't written a commit
	 * record;  __log_flush_commit is responsible for enforcing this.
	 *
	 * If a flush is not needed, see if WRITE_NOSYNC was set and we
	 * need to write out the log buffer.
	 */
	if (LF_ISSET(DB_FLUSH | DB_LOG_WRNOSYNC)) {
		/* The replication path above may have dropped the lock. */
		if (!lock_held) {
			LOG_SYSTEM_LOCK(env);
			lock_held = 1;
		}
		if ((ret = __log_flush_commit(env, &lsn, flags)) != 0)
			goto panic_check;
	}

	/*
	 * If flushed a checkpoint record, reset the "bytes since the last
	 * checkpoint" counters.
	 */
	if (LF_ISSET(DB_LOG_CHKPNT))
		lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;

	/* Increment count of records added to the log. */
	STAT(++lp->stat.st_record);

	/*
	 * The "if (0)" keeps the success path out of the panic check; the
	 * block is entered only through the panic_check label.
	 */
	if (0) {
panic_check:	/*
		 * Writing log records cannot fail if we're a replication
		 * master.  The reason is that once we send the record to
		 * replication clients, the transaction can no longer
		 * abort, otherwise the master would be out of sync with
		 * the rest of the replication group.  Panic the system.
		 */
		if (ret != 0 && IS_REP_MASTER(env))
			ret = __env_panic(env, ret);
	}

	/* Common exit: drop the lock and release the private record copy. */
err:	if (lock_held)
		LOG_SYSTEM_UNLOCK(env);
	if (need_free)
		__os_free(env, dbt->data);

	/*
	 * If auto-remove is set and we switched files, remove unnecessary
	 * log files.
	 */
	if (ret == 0 && !IS_ZERO_LSN(old_lsn) && lp->db_log_autoremove)
		__log_autoremove(env);

	return (ret);
}
366 
367 /*
368  * __log_current_lsn_int --
369  *	internal operations of __log_current_lsn
370  *
371  * PUBLIC: int __log_current_lsn_int
372  * PUBLIC:     __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
373  */
374 int
__log_current_lsn_int(env,lsnp,mbytesp,bytesp)375 __log_current_lsn_int(env, lsnp, mbytesp, bytesp)
376 	ENV *env;
377 	DB_LSN *lsnp;
378 	u_int32_t *mbytesp, *bytesp;
379 {
380 	DB_LOG *dblp;
381 	LOG *lp;
382 
383 	dblp = env->lg_handle;
384 	lp = dblp->reginfo.primary;
385 
386 	LOG_SYSTEM_LOCK(env);
387 
388 	/*
389 	 * We need the LSN of the last entry in the log.
390 	 *
391 	 * Typically, it's easy to get the last written LSN, you simply look
392 	 * at the current log pointer and back up the number of bytes of the
393 	 * last log record.  However, if the last thing we did was write the
394 	 * log header of a new log file, then, this doesn't work, so we return
395 	 * the first log record that will be written in this new file.
396 	 */
397 	*lsnp = lp->lsn;
398 	if (lp->lsn.offset > lp->len)
399 		lsnp->offset -= lp->len;
400 
401 	/*
402 	 * Since we're holding the log region lock, return the bytes put into
403 	 * the log since the last checkpoint, transaction checkpoint needs it.
404 	 *
405 	 * We add the current buffer offset so as to count bytes that have not
406 	 * yet been written, but are sitting in the log buffer.
407 	 */
408 	if (mbytesp != NULL) {
409 		*mbytesp = lp->stat.st_wc_mbytes;
410 		*bytesp = (u_int32_t)(lp->stat.st_wc_bytes + lp->b_off);
411 	}
412 
413 	LOG_SYSTEM_UNLOCK(env);
414 
415 	return (0);
416 }
417 
418 /*
419  * __log_current_lsn --
420  *	Return the current LSN.
421  *
422  * PUBLIC: int __log_current_lsn
423  * PUBLIC:     __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
424  */
425 int
__log_current_lsn(env,lsnp,mbytesp,bytesp)426 __log_current_lsn(env, lsnp, mbytesp, bytesp)
427 	ENV *env;
428 	DB_LSN *lsnp;
429 	u_int32_t *mbytesp, *bytesp;
430 {
431 	DB_THREAD_INFO *ip;
432 	int ret;
433 
434 	ret = 0;
435 	ENV_ENTER(env, ip);
436 	ret = __log_current_lsn_int(env, lsnp, mbytesp, bytesp);
437 	ENV_LEAVE(env, ip);
438 
439 	return ret;
440 }
441 
442 /*
443  * __log_put_next --
444  *	Put the given record as the next in the log, wherever that may
445  * turn out to be.
446  */
447 static int
__log_put_next(env,lsn,dbt,hdr,old_lsnp)448 __log_put_next(env, lsn, dbt, hdr, old_lsnp)
449 	ENV *env;
450 	DB_LSN *lsn;
451 	const DBT *dbt;
452 	HDR *hdr;
453 	DB_LSN *old_lsnp;
454 {
455 	DB_LOG *dblp;
456 	DB_LSN old_lsn;
457 	LOG *lp;
458 	int adv_file, newfile, ret;
459 
460 	dblp = env->lg_handle;
461 	lp = dblp->reginfo.primary;
462 
463 	/*
464 	 * Save a copy of lp->lsn before we might decide to switch log
465 	 * files and change it.  If we do switch log files, and we're
466 	 * doing replication, we'll need to tell our clients about the
467 	 * switch, and they need to receive a NEWFILE message
468 	 * with this "would-be" LSN in order to know they're not
469 	 * missing any log records.
470 	 */
471 	old_lsn = lp->lsn;
472 	newfile = 0;
473 	adv_file = 0;
474 	/*
475 	 * If our current log is at an older version and we want to write
476 	 * a record then we need to advance the log.
477 	 */
478 	if (lp->persist.version != DB_LOGVERSION) {
479 		__log_set_version(env, DB_LOGVERSION);
480 		adv_file = 1;
481 	}
482 
483 	/*
484 	 * If this information won't fit in the file, or if we're a
485 	 * replication client environment and have been told to do so,
486 	 * swap files.
487 	 */
488 	if (adv_file || lp->lsn.offset == 0 ||
489 	    lp->lsn.offset + hdr->size + dbt->size > lp->log_size) {
490 		if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_nsize) {
491 			__db_errx(env, DB_STR_A("2513",
492 	    "DB_ENV->log_put: record larger than maximum file size (%lu > %lu)",
493 			    "%lu %lu"),
494 			    (u_long)hdr->size + sizeof(LOGP) + dbt->size,
495 			    (u_long)lp->log_nsize);
496 			return (EINVAL);
497 		}
498 
499 		if ((ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
500 			return (ret);
501 
502 		/*
503 		 * Flag that we switched files, in case we're a master
504 		 * and need to send this information to our clients.
505 		 * We postpone doing the actual send until we can
506 		 * safely release the log region lock and are doing so
507 		 * anyway.
508 		 */
509 		newfile = 1;
510 	}
511 
512 	/* If we switched log files, let our caller know where. */
513 	if (newfile)
514 		*old_lsnp = old_lsn;
515 
516 	/* Actually put the record. */
517 	return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr));
518 }
519 
520 /*
521  * __log_flush_commit --
522  *	Flush a record.
523  */
524 static int
__log_flush_commit(env,lsnp,flags)525 __log_flush_commit(env, lsnp, flags)
526 	ENV *env;
527 	const DB_LSN *lsnp;
528 	u_int32_t flags;
529 {
530 	DB_LOG *dblp;
531 	DB_LSN flush_lsn;
532 	HDR hdr;
533 	LOG *lp;
534 	int ret, t_ret;
535 	size_t nr, nw;
536 	u_int8_t *buffer;
537 
538 	dblp = env->lg_handle;
539 	lp = dblp->reginfo.primary;
540 	flush_lsn = *lsnp;
541 
542 	ret = 0;
543 
544 	/*
545 	 * DB_FLUSH:
546 	 *	Flush a record for which the DB_FLUSH flag to log_put was set.
547 	 *
548 	 * DB_LOG_WRNOSYNC:
549 	 *	If there's anything in the current log buffer, write it out.
550 	 */
551 	if (LF_ISSET(DB_FLUSH))
552 		ret = __log_flush_int(dblp, &flush_lsn, 1);
553 	else if (!lp->db_log_inmemory && lp->b_off != 0)
554 		if ((ret = __log_write(dblp,
555 		    dblp->bufp, (u_int32_t)lp->b_off)) == 0)
556 			lp->b_off = 0;
557 
558 	/*
559 	 * If a flush supporting a transaction commit fails, we must abort the
560 	 * transaction.  (If we aren't doing a commit, return the failure; if
561 	 * if the commit we care about made it to disk successfully, we just
562 	 * ignore the failure, because there's no way to undo the commit.)
563 	 */
564 	if (ret == 0 || !LF_ISSET(DB_LOG_COMMIT))
565 		return (ret);
566 
567 	if (LF_ISSET(DB_FLUSH) ?
568 	    flush_lsn.file != lp->s_lsn.file ||
569 	    flush_lsn.offset < lp->s_lsn.offset :
570 	    flush_lsn.file != lp->lsn.file || flush_lsn.offset < lp->w_off)
571 		return (0);
572 
573 	if (IS_REP_MASTER(env)) {
574 		__db_err(env, ret, DB_STR("2514",
575 		    "Write failed on MASTER commit."));
576 		return (__env_panic(env, ret));
577 	}
578 	/*
579 	 * If this is a panic don't attempt to abort just this transaction;
580 	 * it may trip over the panic, and the whole env needs to go anyway.
581 	 */
582 	if (ret == DB_RUNRECOVERY)
583 		return (__env_panic(env, ret));
584 	/*
585 	 * Else, make sure that the commit record does not get out after we
586 	 * abort the transaction.  Do this by overwriting the commit record
587 	 * in the buffer.  (Note that other commits in this buffer will wait
588 	 * until a successful write happens, we do not wake them.)  We point
589 	 * at the right part of the buffer and write an abort record over the
590 	 * commit.  We must then try and flush the buffer again, since the
591 	 * interesting part of the buffer may have actually made it out to
592 	 * disk before there was a failure, we can't know for sure.
593 	 */
594 	if (flush_lsn.offset > lp->w_off) {
595 		if ((t_ret = __txn_force_abort(env,
596 		     dblp->bufp + flush_lsn.offset - lp->w_off)) != 0)
597 			return (__env_panic(env, t_ret));
598 	} else {
599 		/*
600 		 * The buffer was written, but its not on disk, we
601 		 * must read it back and force things from a commit
602 		 * state to an abort state.  Lots of things could fail
603 		 * here and we will be left with a commit record but
604 		 * a panic return.
605 		 */
606 		 if (
607 		    (t_ret = __os_seek(env,
608 		    dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
609 		    (t_ret = __os_read(env, dblp->lfhp, &hdr,
610 		    HDR_NORMAL_SZ, &nr)) != 0 || nr != HDR_NORMAL_SZ)
611 			return (__env_panic(env, t_ret == 0 ? EIO : t_ret));
612 		if (LOG_SWAPPED(env))
613 			__log_hdrswap(&hdr, CRYPTO_ON(env));
614 		if ((t_ret = __os_malloc(env, hdr.len, &buffer)) != 0 ||
615 		    (t_ret = __os_seek(env,
616 		    dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
617 		    (t_ret = __os_read(env, dblp->lfhp, buffer,
618 		    hdr.len, &nr)) != 0 || nr != hdr.len ||
619 		    (t_ret = __txn_force_abort(env, buffer)) != 0 ||
620 		    (t_ret = __os_seek(env,
621 		    dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
622 		    (t_ret = __os_write(env, dblp->lfhp, buffer,
623 		    nr, &nw)) != 0 || nw != nr)
624 			return (__env_panic(env, t_ret == 0 ? EIO : t_ret));
625 		__os_free(env, buffer);
626 	}
627 	/*
628 	 * Try to flush the log again, if the disk just bounced then we
629 	 * want to be sure it does not go away again before we write the
630 	 * abort record.
631 	 */
632 	(void)__log_flush_int(dblp, &flush_lsn, 0);
633 
634 	return (ret);
635 }
636 
/*
 * __log_newfile --
 *	Initialize and switch to a new log file.  (Note that this is
 * called both when no log yet exists and when we fill a log file.)
 *
 *	If logfile is non-zero the log is repositioned at that file number;
 *	if version is non-zero the log version is set to it first.  The
 *	persistent header (LOGP) is written as the first record of the new
 *	file and, if lsnp is not NULL, *lsnp is set to lp->lsn afterwards.
 *	Returns 0 on success, non-zero on failure.
 *
 * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *, u_int32_t, u_int32_t));
 */
int
__log_newfile(dblp, lsnp, logfile, version)
	DB_LOG *dblp;
	DB_LSN *lsnp;
	u_int32_t logfile;
	u_int32_t version;
{
	DBT t;
	DB_CIPHER *db_cipher;
	DB_LSN lsn;
	ENV *env;
	HDR hdr;
	LOG *lp;
	LOGP *tpersist;
	int need_free, ret;
	u_int32_t lastoff;
	size_t tsize;

	env = dblp->env;
	lp = dblp->reginfo.primary;

	/*
	 * If we're not specifying a specific log file number and we're
	 * not at the beginning of a file already, start a new one.
	 */
	if (logfile == 0 && lp->lsn.offset != 0) {
		/*
		 * Flush the log so this file is out and can be closed.  We
		 * cannot release the region lock here because we need to
		 * protect the end of the file while we switch.  In
		 * particular, a thread with a smaller record than ours
		 * could detect that there is space in the log. Even
		 * blocking that event by declaring the file full would
		 * require all threads to wait here so that the lsn.file
		 * can be moved ahead after the flush completes.  This
		 * probably can be changed if we had an lsn for the
		 * previous file and one for the current, but it does not
		 * seem like this would get much more throughput, if any.
		 */
		if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
			return (ret);

		/*
		 * Save the last known offset from the previous file, we'll
		 * need it to initialize the persistent header information.
		 */
		lastoff = lp->lsn.offset;

		/* Point the current LSN to the new file. */
		++lp->lsn.file;
		lp->lsn.offset = 0;

		/* Reset the file write offset. */
		lp->w_off = 0;
	} else
		lastoff = 0;

	/*
	 * Replication may require we reset the log file name space entirely.
	 * In that case we also force a file switch so that replication can
	 * clean up old files.
	 */
	if (logfile != 0) {
		lp->lsn.file = logfile;
		lp->lsn.offset = 0;
		lp->w_off = 0;
		if (lp->db_log_inmemory) {
			lsn = lp->lsn;
			(void)__log_zero(env, &lsn);
		} else {
			lp->s_lsn = lp->lsn;
			if ((ret = __log_newfh(dblp, 1)) != 0)
				return (ret);
		}
	}

	DB_ASSERT(env, lp->db_log_inmemory || lp->b_off == 0);
	if (lp->db_log_inmemory &&
	    (ret = __log_inmem_newfile(dblp, lp->lsn.file)) != 0)
		return (ret);

	/*
	 * Insert persistent information as the first record in every file.
	 * Note that the previous length is wrong for the very first record
	 * of the log, but that's okay, we check for it during retrieval.
	 */
	memset(&t, 0, sizeof(t));
	memset(&hdr, 0, sizeof(HDR));

	/*
	 * Build the record in a scratch copy of the persistent header; with
	 * encryption on, the buffer is enlarged by the cipher's adjustment.
	 */
	need_free = 0;
	tsize = sizeof(LOGP);
	db_cipher = env->crypto_handle;
	if (CRYPTO_ON(env))
		tsize += db_cipher->adj_size(tsize);
	if ((ret = __os_calloc(env, 1, tsize, &tpersist)) != 0)
		return (ret);
	need_free = 1;
	/*
	 * If we're told what version to make this file, then we
	 * need to be at that version.  Update here.
	 */
	if (version != 0) {
		__log_set_version(env, version);
		if ((ret = __env_init_rec(env, version)) != 0)
			goto err;
	}
	lp->persist.log_size = lp->log_size = lp->log_nsize;
	memcpy(tpersist, &lp->persist, sizeof(LOGP));
	DB_SET_DBT(t, tpersist, tsize);
	if (LOG_SWAPPED(env))
		__log_persistswap(tpersist);

	if ((ret =
	    __log_encrypt_record(env, &t, &hdr, (u_int32_t)sizeof(LOGP))) != 0)
		goto err;

	/*
	 * The header's "previous record" offset is 0 if we did not come
	 * from an earlier file, else the offset of that file's last record.
	 */
	if ((ret = __log_putr(dblp, &lsn,
	    &t, lastoff == 0 ? 0 : lastoff - lp->len, &hdr)) != 0)
		goto err;

	/* Update the LSN information returned to the caller. */
	if (lsnp != NULL)
		*lsnp = lp->lsn;

err:	if (need_free)
		__os_free(env, tpersist);
	return (ret);
}
772 
/*
 * __log_putr --
 *	Actually put a record into the log.
 *
 *	Writes a header followed by the data in dbt at the current end of
 *	the log.  *lsn is set to the LSN the record is written at, prev is
 *	stored as the header's previous-record offset, and h is the header
 *	to use (a local one is used if h is NULL).  On failure the saved
 *	w_off, b_off and f_lsn values are restored in the LOG structure.
 */
static int
__log_putr(dblp, lsn, dbt, prev, h)
	DB_LOG *dblp;
	DB_LSN *lsn;
	const DBT *dbt;
	u_int32_t prev;
	HDR *h;
{
	DB_CIPHER *db_cipher;
	DB_LSN f_lsn;
	ENV *env;
	HDR tmp, *hdr;
	LOG *lp;
	int ret, t_ret;
	db_size_t b_off;
	size_t nr;
	u_int32_t w_off;

	env = dblp->env;
	lp = dblp->reginfo.primary;

	/*
	 * If we weren't given a header, use a local one.
	 */
	db_cipher = env->crypto_handle;
	if (h == NULL) {
		hdr = &tmp;
		memset(hdr, 0, sizeof(HDR));
		if (CRYPTO_ON(env))
			hdr->size = HDR_CRYPTO_SZ;
		else
			hdr->size = HDR_NORMAL_SZ;
	} else
		hdr = h;

	/* Save our position in case we fail. */
	b_off = lp->b_off;
	w_off = lp->w_off;
	f_lsn = lp->f_lsn;

	/*
	 * Initialize the header.  If we just switched files, lsn.offset will
	 * be 0, and what we really want is the offset of the previous record
	 * in the previous file.  Fortunately, prev holds the value we want.
	 */
	hdr->prev = prev;
	hdr->len = (u_int32_t)hdr->size + dbt->size;

#ifdef HAVE_LOG_CHECKSUM
	/*
	 * If we were passed in a nonzero checksum, our caller calculated
	 * the checksum before acquiring the log mutex, as an optimization.
	 *
	 * If our caller calculated a real checksum of 0, we'll needlessly
	 * recalculate it.  C'est la vie;  there's no out-of-bounds value
	 * here.
	 */
	if (hdr->chksum[0] == 0) {
		/* Logs older than DB_LOGCHKSUM are summed without the header. */
		if (lp->persist.version < DB_LOGCHKSUM)
			__db_chksum(NULL, dbt->data, dbt->size,
			    (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
			    hdr->chksum);
		else
			__db_chksum(hdr, dbt->data, dbt->size,
			    (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
			    hdr->chksum);
	} else if (lp->persist.version >= DB_LOGCHKSUM)
		/*
		 * We need to include hdr->prev and len here, since they were
		 * still zero at the time of the caller's __db_chksum() call.
		 */
		LOG_HDR_SUM(CRYPTO_ON(env), hdr, hdr->chksum);
#endif

	if (lp->db_log_inmemory && (ret = __log_inmem_chkspace(dblp,
	    (u_int32_t)hdr->size + dbt->size)) != 0)
		goto err;

	/*
	 * The offset into the log file at this point is the LSN where
	 * we're about to put this record, and is the LSN the caller wants.
	 */
	*lsn = lp->lsn;

	/*
	 * Write the header.  On a byte-swapped log the header is swapped
	 * for the write and swapped back afterwards; hdr->size is captured
	 * in nr before the first swap.
	 */
	nr = hdr->size;
	if (LOG_SWAPPED(env))
		__log_hdrswap(hdr, CRYPTO_ON(env));

	 /* nr can't overflow a 32 bit value - header size is internal. */
	ret = __log_fill(dblp, lsn, hdr, (u_int32_t)nr);

	if (LOG_SWAPPED(env))
		__log_hdrswap(hdr, CRYPTO_ON(env));

	if (ret != 0)
		goto err;

	/* Write the record data after the header. */
	if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0)
		goto err;

	/* Remember this record's length and advance the end-of-log LSN. */
	lp->len = (u_int32_t)(hdr->size + dbt->size);
	lp->lsn.offset += lp->len;
	return (0);
err:
	/*
	 * If we wrote more than one buffer before failing, get the
	 * first one back.  The extra buffers will fail the checksums
	 * and be ignored.
	 */
	if (w_off + lp->buffer_size < lp->w_off) {
		DB_ASSERT(env, !lp->db_log_inmemory);
		if ((t_ret = __os_seek(env, dblp->lfhp, 0, 0, w_off)) != 0 ||
		    (t_ret = __os_read(env, dblp->lfhp, dblp->bufp,
		    b_off, &nr)) != 0)
			return (__env_panic(env, t_ret));
		if (nr != b_off) {
			__db_errx(env, DB_STR("2515",
			    "Short read while restoring log"));
			return (__env_panic(env, EIO));
		}
	}

	/* Reset to where we started. */
	lp->w_off = w_off;
	lp->b_off = b_off;
	lp->f_lsn = f_lsn;

	return (ret);
}
906 
/*
 * __log_flush_pp --
 *	ENV->log_flush pre/post processing.
 *
 *	Checks that logging is configured for the environment, then calls
 *	__log_flush inside ENV_ENTER/ENV_LEAVE and REPLICATION_WRAP.
 *	Returns the value left in ret by REPLICATION_WRAP.
 *
 * PUBLIC: int __log_flush_pp __P((DB_ENV *, const DB_LSN *));
 */
int
__log_flush_pp(dbenv, lsn)
	DB_ENV *dbenv;
	const DB_LSN *lsn;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbenv->env;

	ENV_REQUIRES_CONFIG(env,
	    env->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG);

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__log_flush(env, lsn)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}
932 
/*
 * See if we need to wait.  s_lsn is not locked so some care is needed.
 * The sync point can only move forward.  The lsnp->file cannot be
 * greater than the s_lsn.file.  If the file we want is in the past
 * we are done.  If the file numbers are the same, check the offset.
 * This all assumes we can read a 32-bit quantity in one state or
 * the other, not in transition.
 *
 * Evaluates to non-zero when (lp)->s_lsn is in a later file than (lsnp),
 * or in the same file at a strictly greater offset.
 */
#define	ALREADY_FLUSHED(lp, lsnp)					\
	(((lp)->s_lsn.file > (lsnp)->file) ||				\
	((lp)->s_lsn.file == (lsnp)->file &&				\
	    (lp)->s_lsn.offset > (lsnp)->offset))
945 
946 /*
947  * __log_flush --
948  *	ENV->log_flush
949  *
950  * PUBLIC: int __log_flush __P((ENV *, const DB_LSN *));
951  */
952 int
__log_flush(env,lsn)953 __log_flush(env, lsn)
954 	ENV *env;
955 	const DB_LSN *lsn;
956 {
957 	DB_LOG *dblp;
958 	LOG *lp;
959 	int ret;
960 
961 	dblp = env->lg_handle;
962 	lp = dblp->reginfo.primary;
963 	if (lsn != NULL && ALREADY_FLUSHED(lp, lsn))
964 		return (0);
965 	LOG_SYSTEM_LOCK(env);
966 	ret = __log_flush_int(dblp, lsn, 1);
967 	LOG_SYSTEM_UNLOCK(env);
968 	return (ret);
969 }
970 
/*
 * __log_flush_int --
 *	Write all records less than or equal to the specified LSN; internal
 *	version.
 *
 *	dblp:	 per-process log handle.
 *	lsnp:	 LSN to flush through, or NULL to flush the whole log.
 *	release: if non-zero, this function may drop the log region lock
 *		 (LOG_SYSTEM_UNLOCK) while it waits for another flush or
 *		 syncs the file, and reacquires it before returning.  The
 *		 visible caller takes LOG_SYSTEM_LOCK before calling.
 *
 *	Returns 0 on success (including "nothing to do"), the result of
 *	__env_panic if the LSN is past the end of the log, or the error from
 *	__mutex_alloc, __log_write, __log_newfh or __os_fsync.
 *
 * PUBLIC: int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
 */
int
__log_flush_int(dblp, lsnp, release)
	DB_LOG *dblp;
	const DB_LSN *lsnp;
	int release;
{
	struct __db_commit *commit;
	ENV *env;
	DB_LSN flush_lsn, f_lsn;
	LOG *lp;
	size_t b_off;
	u_int32_t ncommit, w_off;
	int do_flush, first, ret;

	env = dblp->env;
	lp = dblp->reginfo.primary;
	ncommit = 0;
	ret = 0;

	/*
	 * In-memory logs have nothing to sync: just advance the last-synced
	 * LSN to the current end of the log and count the call.
	 */
	if (lp->db_log_inmemory) {
		lp->s_lsn = lp->lsn;
		STAT(++lp->stat.st_scount);
		return (0);
	}

	/*
	 * If no LSN specified, flush the entire log by setting the flush LSN
	 * to the last LSN written in the log.  Otherwise, check that the LSN
	 * isn't a non-existent record for the log.
	 */
	if (lsnp == NULL) {
		flush_lsn.file = lp->lsn.file;
		flush_lsn.offset = lp->lsn.offset - lp->len;
	} else if (lsnp->file > lp->lsn.file ||
	    (lsnp->file == lp->lsn.file &&
	    lsnp->offset > lp->lsn.offset - lp->len)) {
		__db_errx(env, DB_STR_A("2516",
    "DB_ENV->log_flush: LSN of %lu/%lu past current end-of-log of %lu/%lu",
		    "%lu %lu %lu %lu"), (u_long)lsnp->file,
		    (u_long)lsnp->offset, (u_long)lp->lsn.file,
		    (u_long)lp->lsn.offset);
		__db_errx(env, DB_STR("2517",
		    "Database environment corrupt; the wrong log files may "
		    "have been removed or incompatible database files "
		    "imported from another environment"));
		return (__env_panic(env, DB_RUNRECOVERY));
	} else {
		if (ALREADY_FLUSHED(lp, lsnp))
			return (0);
		flush_lsn = *lsnp;
	}

	/*
	 * If a flush is in progress and we're allowed to do so, drop
	 * the region lock and block waiting for the next flush.
	 */
	if (release && lp->in_flush != 0) {
		/*
		 * Reuse a commit structure from the free list if there is
		 * one, otherwise allocate a new one and its wait mutex.
		 */
		if ((commit = SH_TAILQ_FIRST(
		    &lp->free_commits, __db_commit)) == NULL) {
			/*
			 * If the region allocation fails we cannot wait, so
			 * fall back to flushing directly.
			 *
			 * NOTE(review): ret keeps the allocation error on
			 * this path and is only overwritten if one of the
			 * later calls assigns it -- confirm that returning
			 * it after a successful fallback flush is intended.
			 */
			if ((ret = __env_alloc(&dblp->reginfo,
			    sizeof(struct __db_commit), &commit)) != 0)
				goto flush;
			memset(commit, 0, sizeof(*commit));
			if ((ret = __mutex_alloc(env, MTX_TXN_COMMIT,
			    DB_MUTEX_SELF_BLOCK, &commit->mtx_txnwait)) != 0) {
				__env_alloc_free(&dblp->reginfo, commit);
				return (ret);
			}
			/*
			 * Lock the new mutex once now so that the second
			 * lock below blocks until a flusher unlocks it.
			 */
			MUTEX_LOCK_NO_CTR(env, commit->mtx_txnwait);
		} else
			SH_TAILQ_REMOVE(
			    &lp->free_commits, commit, links, __db_commit);

		lp->ncommit++;

		/*
		 * Flushes may be requested out of LSN order;  be
		 * sure we only move lp->t_lsn forward.
		 */
		if (LOG_COMPARE(&lp->t_lsn, &flush_lsn) < 0)
			lp->t_lsn = flush_lsn;

		/* Queue our request where the flushing thread will see it. */
		commit->lsn = flush_lsn;
		SH_TAILQ_INSERT_HEAD(
		    &lp->commits, commit, links, __db_commit);
		LOG_SYSTEM_UNLOCK(env);
		/* Wait here for the in-progress flush to finish. */
		MUTEX_LOCK_NO_CTR(env, commit->mtx_txnwait);
		LOG_SYSTEM_LOCK(env);

		lp->ncommit--;
		/*
		 * Grab the flag before freeing the struct to see if
		 * we need to flush the log to commit.  If so,
		 * use the maximal lsn for any committing thread.
		 */
		do_flush = F_ISSET(commit, DB_COMMIT_FLUSH);
		F_CLR(commit, DB_COMMIT_FLUSH);
		SH_TAILQ_INSERT_HEAD(
		    &lp->free_commits, commit, links, __db_commit);
		if (do_flush) {
			/*
			 * The thread that woke us incremented in_flush on
			 * our behalf (see the "done" loop); undo that here
			 * since it is incremented again before the sync.
			 */
			lp->in_flush--;
			flush_lsn = lp->t_lsn;
		} else
			return (0);
	}

	/*
	 * Protect flushing with its own mutex so we can release
	 * the region lock except during file switches.
	 */
flush:	MUTEX_LOCK(env, lp->mtx_flush);

	/*
	 * If the LSN is less than or equal to the last-sync'd LSN, we're done.
	 * Note, the last-sync LSN saved in s_lsn is the LSN of the first byte
	 * after the byte we absolutely know was written to disk, so the test
	 * is <, not <=.
	 */
	if (flush_lsn.file < lp->s_lsn.file ||
	    (flush_lsn.file == lp->s_lsn.file &&
	    flush_lsn.offset < lp->s_lsn.offset)) {
		MUTEX_UNLOCK(env, lp->mtx_flush);
		goto done;
	}

	/*
	 * We may need to write the current buffer.  We have to write the
	 * current buffer if the flush LSN is greater than or equal to the
	 * buffer's starting LSN.
	 *
	 * Otherwise, it's still possible that this thread may never have
	 * written to this log file.  Acquire a file descriptor if we don't
	 * already have one.
	 */
	if (lp->b_off != 0 && LOG_COMPARE(&flush_lsn, &lp->f_lsn) >= 0) {
		if ((ret = __log_write(dblp,
		    dblp->bufp, (u_int32_t)lp->b_off)) != 0) {
			MUTEX_UNLOCK(env, lp->mtx_flush);
			goto done;
		}

		lp->b_off = 0;
	} else if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file)
		if ((ret = __log_newfh(dblp, 0)) != 0) {
			MUTEX_UNLOCK(env, lp->mtx_flush);
			goto done;
		}

	/*
	 * We are going to flush, release the region.
	 * First get the current state of the buffer since
	 * another write may come in, but we may not flush it.
	 */
	b_off = lp->b_off;
	w_off = lp->w_off;
	f_lsn = lp->f_lsn;
	lp->in_flush++;
	if (release)
		LOG_SYSTEM_UNLOCK(env);

	/* Sync all writes to disk. */
	if (!lp->nosync) {
		if ((ret = __os_fsync(env, dblp->lfhp)) != 0) {
			/*
			 * Sync failed: drop the flush mutex, retake the
			 * region lock if we released it, and undo the
			 * in_flush increment before waking any waiters.
			 */
			MUTEX_UNLOCK(env, lp->mtx_flush);
			if (release)
				LOG_SYSTEM_LOCK(env);
			lp->in_flush--;
			goto done;
		}
		STAT(++lp->stat.st_scount);
	}

	/*
	 * Set the last-synced LSN.
	 * This value must be set to the LSN past the last complete
	 * record that has been flushed.  This is at least the first
	 * lsn, f_lsn.  If the buffer is empty, b_off == 0, then
	 * we can move up to write point since the first lsn is not
	 * set for the new buffer.
	 */
	lp->s_lsn = f_lsn;
	if (b_off == 0)
		lp->s_lsn.offset = w_off;

	MUTEX_UNLOCK(env, lp->mtx_flush);
	if (release)
		LOG_SYSTEM_LOCK(env);

	lp->in_flush--;

	/*
	 * How many flush calls (usually commits) did this call actually sync?
	 * At least one, if it got here.
	 */
	ncommit = 1;
done:
	/*
	 * Wake waiting threads.  Every waiter whose LSN is now below the
	 * last-synced LSN is released without the flush flag, so it simply
	 * returns.  The first waiter that is not yet covered is released
	 * with DB_COMMIT_FLUSH set so that it performs the next flush; any
	 * remaining uncovered waiters stay queued.
	 */
	if (lp->ncommit != 0) {
		first = 1;
		SH_TAILQ_FOREACH(commit, &lp->commits, links, __db_commit)
			if (LOG_COMPARE(&lp->s_lsn, &commit->lsn) > 0) {
				MUTEX_UNLOCK_NO_CTR(env, commit->mtx_txnwait);
				SH_TAILQ_REMOVE(
				    &lp->commits, commit, links, __db_commit);
				ncommit++;
			} else if (first == 1) {
				F_SET(commit, DB_COMMIT_FLUSH);
				MUTEX_UNLOCK_NO_CTR(env, commit->mtx_txnwait);
				SH_TAILQ_REMOVE(
				    &lp->commits, commit, links, __db_commit);
				/*
				 * This thread will wake and flush.
				 * If another thread commits and flushes
				 * first we will waste a trip through the
				 * mutex.
				 */
				lp->in_flush++;
				first = 0;
			}
	}
#ifdef HAVE_STATISTICS
	/* Track the largest and smallest number of requests per flush. */
	if (lp->stat.st_maxcommitperflush < ncommit)
		lp->stat.st_maxcommitperflush = ncommit;
	if (lp->stat.st_mincommitperflush > ncommit ||
	    lp->stat.st_mincommitperflush == 0)
		lp->stat.st_mincommitperflush = ncommit;
#endif

	return (ret);
}
1208 
1209 /*
1210  * __log_fill --
1211  *	Write information into the log.
1212  */
1213 static int
__log_fill(dblp,lsn,addr,len)1214 __log_fill(dblp, lsn, addr, len)
1215 	DB_LOG *dblp;
1216 	DB_LSN *lsn;
1217 	void *addr;
1218 	u_int32_t len;
1219 {
1220 	LOG *lp;
1221 	u_int32_t bsize, nrec;
1222 	size_t nw, remain;
1223 	int ret;
1224 
1225 	lp = dblp->reginfo.primary;
1226 	bsize = lp->buffer_size;
1227 
1228 	if (lp->db_log_inmemory) {
1229 		__log_inmem_copyin(dblp, lp->b_off, addr, len);
1230 		lp->b_off = (lp->b_off + len) % lp->buffer_size;
1231 		return (0);
1232 	}
1233 
1234 	while (len > 0) {			/* Copy out the data. */
1235 		/*
1236 		 * If we're beginning a new buffer, note the user LSN to which
1237 		 * the first byte of the buffer belongs.  We have to know this
1238 		 * when flushing the buffer so that we know if the in-memory
1239 		 * buffer needs to be flushed.
1240 		 */
1241 		if (lp->b_off == 0)
1242 			lp->f_lsn = *lsn;
1243 
1244 		/*
1245 		 * If we're on a buffer boundary and the data is big enough,
1246 		 * copy as many records as we can directly from the data.
1247 		 */
1248 		if (lp->b_off == 0 && len >= bsize) {
1249 			nrec = len / bsize;
1250 			if ((ret = __log_write(dblp, addr, nrec * bsize)) != 0)
1251 				return (ret);
1252 			addr = (u_int8_t *)addr + nrec * bsize;
1253 			len -= nrec * bsize;
1254 			STAT(++lp->stat.st_wcount_fill);
1255 			continue;
1256 		}
1257 
1258 		/* Figure out how many bytes we can copy this time. */
1259 		remain = bsize - lp->b_off;
1260 		nw = remain > len ? len : remain;
1261 		memcpy(dblp->bufp + lp->b_off, addr, nw);
1262 		addr = (u_int8_t *)addr + nw;
1263 		len -= (u_int32_t)nw;
1264 		lp->b_off += (u_int32_t)nw;
1265 
1266 		/* If we fill the buffer, flush it. */
1267 		if (lp->b_off == bsize) {
1268 			if ((ret = __log_write(dblp, dblp->bufp, bsize)) != 0)
1269 				return (ret);
1270 			lp->b_off = 0;
1271 			STAT(++lp->stat.st_wcount_fill);
1272 		}
1273 	}
1274 	return (0);
1275 }
1276 
1277 /*
1278  * __log_write --
1279  *	Write the log buffer to disk.
1280  */
1281 static int
__log_write(dblp,addr,len)1282 __log_write(dblp, addr, len)
1283 	DB_LOG *dblp;
1284 	void *addr;
1285 	u_int32_t len;
1286 {
1287 	ENV *env;
1288 	LOG *lp;
1289 	size_t nw;
1290 	int ret;
1291 
1292 	env = dblp->env;
1293 	lp = dblp->reginfo.primary;
1294 
1295 	DB_ASSERT(env, !lp->db_log_inmemory);
1296 
1297 	/*
1298 	 * If we haven't opened the log file yet or the current one has
1299 	 * changed, acquire a new log file.  We are creating the file if we're
1300 	 * about to write to the start of it, in other words, if the write
1301 	 * offset is zero.
1302 	 */
1303 	if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file ||
1304 	    dblp->lf_timestamp != lp->timestamp)
1305 		if ((ret = __log_newfh(dblp, lp->w_off == 0)) != 0)
1306 			return (ret);
1307 
1308 	/*
1309 	 * If we're writing the first block in a log file on a filesystem that
1310 	 * guarantees unwritten blocks are zero-filled, we set the size of the
1311 	 * file in advance.  This increases sync performance on some systems,
1312 	 * because they don't need to update metadata on every sync.
1313 	 *
1314 	 * Ignore any error -- we may have run out of disk space, but that's no
1315 	 * reason to quit.
1316 	 */
1317 #ifdef HAVE_FILESYSTEM_NOTZERO
1318 	if (lp->w_off == 0 && !__os_fs_notzero()) {
1319 #else
1320 	if (lp->w_off == 0) {
1321 #endif
1322 		(void)__db_file_extend(env, dblp->lfhp, lp->log_size);
1323 		if (F_ISSET(dblp, DBLOG_ZERO))
1324 			(void)__db_zero_extend(env, dblp->lfhp,
1325 			     0, lp->log_size/lp->buffer_size, lp->buffer_size);
1326 
1327 	}
1328 
1329 	/*
1330 	 * Seek to the offset in the file (someone may have written it
1331 	 * since we last did).
1332 	 */
1333 	if ((ret = __os_io(env, DB_IO_WRITE,
1334 	    dblp->lfhp, 0, 0, lp->w_off, len, addr, &nw)) != 0)
1335 		return (ret);
1336 
1337 	/* Reset the buffer offset and update the seek offset. */
1338 	lp->w_off += len;
1339 
1340 	/* Update written statistics. */
1341 	if ((lp->stat.st_wc_bytes += len) >= MEGABYTE) {
1342 		lp->stat.st_wc_bytes -= MEGABYTE;
1343 		++lp->stat.st_wc_mbytes;
1344 	}
1345 #ifdef HAVE_STATISTICS
1346 	if ((lp->stat.st_w_bytes += len) >= MEGABYTE) {
1347 		lp->stat.st_w_bytes -= MEGABYTE;
1348 		++lp->stat.st_w_mbytes;
1349 	}
1350 	++lp->stat.st_wcount;
1351 #endif
1352 
1353 	return (0);
1354 }
1355 
1356 /*
1357  * __log_file_pp --
1358  *	ENV->log_file pre/post processing.
1359  *
1360  * PUBLIC: int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t));
1361  */
1362 int
__log_file_pp(dbenv,lsn,namep,len)1363 __log_file_pp(dbenv, lsn, namep, len)
1364 	DB_ENV *dbenv;
1365 	const DB_LSN *lsn;
1366 	char *namep;
1367 	size_t len;
1368 {
1369 	DB_THREAD_INFO *ip;
1370 	ENV *env;
1371 	int ret, set;
1372 
1373 	env = dbenv->env;
1374 
1375 	ENV_REQUIRES_CONFIG(env,
1376 	    env->lg_handle, "DB_ENV->log_file", DB_INIT_LOG);
1377 
1378 	if ((ret = __log_get_config(dbenv, DB_LOG_IN_MEMORY, &set)) != 0)
1379 		return (ret);
1380 	if (set) {
1381 		__db_errx(env, DB_STR("2518",
1382 		    "DB_ENV->log_file is illegal with in-memory logs"));
1383 		return (EINVAL);
1384 	}
1385 
1386 	ENV_ENTER(env, ip);
1387 	REPLICATION_WRAP(env, (__log_file(env, lsn, namep, len)), 0, ret);
1388 	ENV_LEAVE(env, ip);
1389 	return (ret);
1390 }
1391 
1392 /*
1393  * __log_file --
1394  *	ENV->log_file.
1395  */
1396 static int
__log_file(env,lsn,namep,len)1397 __log_file(env, lsn, namep, len)
1398 	ENV *env;
1399 	const DB_LSN *lsn;
1400 	char *namep;
1401 	size_t len;
1402 {
1403 	DB_LOG *dblp;
1404 	int ret;
1405 	char *name;
1406 
1407 	dblp = env->lg_handle;
1408 	LOG_SYSTEM_LOCK(env);
1409 	ret = __log_name(dblp, lsn->file, &name, NULL, 0);
1410 	LOG_SYSTEM_UNLOCK(env);
1411 	if (ret != 0)
1412 		return (ret);
1413 
1414 	/* Check to make sure there's enough room and copy the name. */
1415 	if (len < strlen(name) + 1) {
1416 		*namep = '\0';
1417 		__db_errx(env, DB_STR("2519",
1418 		    "DB_ENV->log_file: name buffer is too short"));
1419 		return (EINVAL);
1420 	}
1421 	(void)strcpy(namep, name);
1422 	__os_free(env, name);
1423 
1424 	return (0);
1425 }
1426 
1427 /*
1428  * __log_newfh --
1429  *	Acquire a file handle for the current log file.
1430  */
1431 static int
__log_newfh(dblp,create)1432 __log_newfh(dblp, create)
1433 	DB_LOG *dblp;
1434 	int create;
1435 {
1436 	ENV *env;
1437 	LOG *lp;
1438 	u_int32_t flags;
1439 	int ret;
1440 	logfile_validity status;
1441 
1442 	env = dblp->env;
1443 	lp = dblp->reginfo.primary;
1444 
1445 	/* Close any previous file descriptor. */
1446 	if (dblp->lfhp != NULL) {
1447 		(void)__os_closehandle(env, dblp->lfhp);
1448 		dblp->lfhp = NULL;
1449 	}
1450 
1451 	flags = DB_OSO_SEQ |
1452 	    (create ? DB_OSO_CREATE : 0) |
1453 	    (F_ISSET(dblp, DBLOG_DIRECT) ? DB_OSO_DIRECT : 0) |
1454 	    (F_ISSET(dblp, DBLOG_DSYNC) ? DB_OSO_DSYNC : 0);
1455 
1456 	/* Get the path of the new file and open it. */
1457 	dblp->lfname = lp->lsn.file;
1458 	if ((ret = __log_valid(dblp, dblp->lfname, 0, &dblp->lfhp,
1459 	    flags, &status, NULL)) != 0)
1460 		__db_err(env, ret,
1461 		    "DB_ENV->log_newfh: %lu", (u_long)lp->lsn.file);
1462 	else if (status != DB_LV_NORMAL && status != DB_LV_INCOMPLETE &&
1463 	    status != DB_LV_OLD_READABLE)
1464 		ret = USR_ERR(env, DB_NOTFOUND);
1465 
1466 	return (ret);
1467 }
1468 
1469 /*
1470  * __log_name --
1471  *	Return the log name for a particular file, and optionally open it.
1472  *
1473  * PUBLIC: int __log_name __P((DB_LOG *,
1474  * PUBLIC:     u_int32_t, char **, DB_FH **, u_int32_t));
1475  */
1476 int
__log_name(dblp,filenumber,namep,fhpp,flags)1477 __log_name(dblp, filenumber, namep, fhpp, flags)
1478 	DB_LOG *dblp;
1479 	u_int32_t filenumber, flags;
1480 	char **namep;
1481 	DB_FH **fhpp;
1482 {
1483 	ENV *env;
1484 	LOG *lp;
1485 	int mode, ret;
1486 	char *oname;
1487 	char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
1488 
1489 	env = dblp->env;
1490 	lp = dblp->reginfo.primary;
1491 
1492 	DB_ASSERT(env, !lp->db_log_inmemory);
1493 
1494 	/*
1495 	 * !!!
1496 	 * The semantics of this routine are bizarre.
1497 	 *
1498 	 * The reason for all of this is that we need a place where we can
1499 	 * intercept requests for log files, and, if appropriate, check for
1500 	 * both the old-style and new-style log file names.  The trick is
1501 	 * that all callers of this routine that are opening the log file
1502 	 * read-only want to use an old-style file name if they can't find
1503 	 * a match using a new-style name.  The only down-side is that some
1504 	 * callers may check for the old-style when they really don't need
1505 	 * to, but that shouldn't mess up anything, and we only check for
1506 	 * the old-style name when we've already failed to find a new-style
1507 	 * one.
1508 	 *
1509 	 * Create a new-style file name, and if we're not going to open the
1510 	 * file, return regardless.
1511 	 */
1512 	(void)snprintf(new, sizeof(new), LFNAME, filenumber);
1513 	if ((ret = __db_appname(env,
1514 	    DB_APP_LOG, new, NULL, namep)) != 0 || fhpp == NULL)
1515 		return (ret);
1516 
1517 	/* The application may have specified an absolute file mode. */
1518 	if (lp->filemode == 0)
1519 		mode = env->db_mode;
1520 	else {
1521 		LF_SET(DB_OSO_ABSMODE);
1522 		mode = lp->filemode;
1523 	}
1524 
1525 	/* Open the new-style file -- if we succeed, we're done. */
1526 	dblp->lf_timestamp = lp->timestamp;
1527 	if ((ret = __os_open(env, *namep, 0, flags, mode, fhpp)) == 0)
1528 		return (0);
1529 
1530 	/*
1531 	 * If the open failed for reason other than the file
1532 	 * not being there, complain loudly, the wrong user
1533 	 * probably started up the application.
1534 	 */
1535 	if (ret != ENOENT) {
1536 		__db_err(env, ret, DB_STR_A("2520",
1537 		    "%s: log file unreadable", "%s"), *namep);
1538 		return (__env_panic(env, ret));
1539 	}
1540 
1541 	/*
1542 	 * The open failed... if the DB_RDONLY flag isn't set, we're done,
1543 	 * the caller isn't interested in old-style files.
1544 	 */
1545 	if (!LF_ISSET(DB_OSO_RDONLY)) {
1546 		__db_err(env, ret, DB_STR_A("2521",
1547 		    "%s: log file open failed", "%s"), *namep);
1548 		return (__env_panic(env, ret));
1549 	}
1550 
1551 	/* Create an old-style file name. */
1552 	(void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
1553 	if ((ret = __db_appname(env,
1554 	    DB_APP_LOG, old, NULL, &oname)) != 0)
1555 		goto err;
1556 
1557 	/*
1558 	 * Open the old-style file -- if we succeed, we're done.  Free the
1559 	 * space allocated for the new-style name and return the old-style
1560 	 * name to the caller.
1561 	 */
1562 	if ((ret = __os_open(env, oname, 0, flags, mode, fhpp)) == 0) {
1563 		__os_free(env, *namep);
1564 		*namep = oname;
1565 		return (0);
1566 	}
1567 
1568 	/*
1569 	 * Couldn't find either style of name -- return the new-style name
1570 	 * for the caller's error message.  If it's an old-style name that's
1571 	 * actually missing we're going to confuse the user with the error
1572 	 * message, but that implies that not only were we looking for an
1573 	 * old-style name, but we expected it to exist and we weren't just
1574 	 * looking for any log file.  That's not a likely error.
1575 	 */
1576 err:	__os_free(env, oname);
1577 	return (ret);
1578 }
1579 
1580 /*
1581  * __log_rep_put --
1582  *	Short-circuit way for replication clients to put records into the
1583  * log.  Replication clients' logs need to be laid out exactly as their masters'
1584  * are, so we let replication take responsibility for when the log gets
1585  * flushed, when log switches files, etc.  This is just a thin PUBLIC wrapper
1586  * for __log_putr with a slightly prettier interface.
1587  *
1588  * Note that the REP->mtx_clientdb should be held when this is called.
1589  * Note that we acquire the log region mutex while holding mtx_clientdb.
1590  *
1591  * PUBLIC: int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
1592  */
1593 int
__log_rep_put(env,lsnp,rec,flags)1594 __log_rep_put(env, lsnp, rec, flags)
1595 	ENV *env;
1596 	DB_LSN *lsnp;
1597 	const DBT *rec;
1598 	u_int32_t flags;
1599 {
1600 	DBT *dbt, t;
1601 	DB_CIPHER *db_cipher;
1602 	DB_LOG *dblp;
1603 	HDR hdr;
1604 	LOG *lp;
1605 	int need_free, ret;
1606 
1607 	dblp = env->lg_handle;
1608 	lp = dblp->reginfo.primary;
1609 
1610 	LOG_SYSTEM_LOCK(env);
1611 	memset(&hdr, 0, sizeof(HDR));
1612 	t = *rec;
1613 	dbt = &t;
1614 	need_free = 0;
1615 	db_cipher = env->crypto_handle;
1616 	if (CRYPTO_ON(env))
1617 		t.size += db_cipher->adj_size(rec->size);
1618 	if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
1619 		goto err;
1620 	need_free = 1;
1621 	memcpy(t.data, rec->data, rec->size);
1622 
1623 	if ((ret = __log_encrypt_record(env, dbt, &hdr, rec->size)) != 0)
1624 		goto err;
1625 
1626 	DB_ASSERT(env, LOG_COMPARE(lsnp, &lp->lsn) == 0);
1627 	ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr);
1628 err:
1629 	/*
1630 	 * !!! Assume caller holds REP->mtx_clientdb to modify ready_lsn.
1631 	 */
1632 	lp->ready_lsn = lp->lsn;
1633 
1634 	if (LF_ISSET(DB_LOG_CHKPNT))
1635 		lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
1636 
1637 	/* Increment count of records added to the log. */
1638 	STAT(++lp->stat.st_record);
1639 	LOG_SYSTEM_UNLOCK(env);
1640 	if (need_free)
1641 		__os_free(env, t.data);
1642 	return (ret);
1643 }
1644 
1645 /*
1646  * __log_rep_write --
1647  *	Way for replication clients to write the log buffer for the
1648  * DB_TXN_WRITE_NOSYNC option.  This is just a thin PUBLIC wrapper
1649  * for __log_write that is similar to __log_flush_commit.
1650  *
1651  * Note that the REP->mtx_clientdb should be held when this is called.
1652  * Note that we acquire the log region mutex while holding mtx_clientdb.
1653  *
1654  * PUBLIC: int __log_rep_write __P((ENV *));
1655  */
1656 int
__log_rep_write(env)1657 __log_rep_write(env)
1658 	ENV *env;
1659 {
1660 	DB_LOG *dblp;
1661 	LOG *lp;
1662 	int ret;
1663 
1664 	dblp = env->lg_handle;
1665 	lp = dblp->reginfo.primary;
1666 	ret = 0;
1667 	LOG_SYSTEM_LOCK(env);
1668 	if (!lp->db_log_inmemory && lp->b_off != 0)
1669 		if ((ret = __log_write(dblp, dblp->bufp,
1670 		    (u_int32_t)lp->b_off)) == 0)
1671 			lp->b_off = 0;
1672 	LOG_SYSTEM_UNLOCK(env);
1673 	return (ret);
1674 }
1675 
1676 static int
__log_encrypt_record(env,dbt,hdr,orig)1677 __log_encrypt_record(env, dbt, hdr, orig)
1678 	ENV *env;
1679 	DBT *dbt;
1680 	HDR *hdr;
1681 	u_int32_t orig;
1682 {
1683 	DB_CIPHER *db_cipher;
1684 	int ret;
1685 
1686 	if (CRYPTO_ON(env)) {
1687 		db_cipher = env->crypto_handle;
1688 		hdr->size = HDR_CRYPTO_SZ;
1689 		hdr->orig_size = orig;
1690 		if ((ret = db_cipher->encrypt(env, db_cipher->data,
1691 		    hdr->iv, dbt->data, dbt->size)) != 0)
1692 			return (ret);
1693 	} else {
1694 		hdr->size = HDR_NORMAL_SZ;
1695 	}
1696 	return (0);
1697 }
1698 /*
1699  * __log_put_record_pp --
1700  *	DB_ENV->log_put_record pre/post processing.
1701  *
1702  * PUBLIC: int __log_put_record_pp __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *,
1703  * PUBLIC:     u_int32_t, u_int32_t, u_int32_t, u_int32_t,
1704  * PUBLIC:     DB_LOG_RECSPEC *, ...));
1705  */
1706 int
__log_put_record_pp(DB_ENV * dbenv,DB * dbp,DB_TXN * txnp,DB_LSN * ret_lsnp,u_int32_t flags,u_int32_t rectype,u_int32_t has_data,u_int32_t size,DB_LOG_RECSPEC * spec,...)1707 __log_put_record_pp(DB_ENV *dbenv, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
1708     u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
1709     DB_LOG_RECSPEC *spec, ...)
1710 {
1711 	DB_THREAD_INFO *ip;
1712 	ENV *env;
1713 	va_list argp;
1714 	int ret;
1715 
1716 	env = dbenv->env;
1717 
1718 	ENV_REQUIRES_CONFIG(env,
1719 	    env->lg_handle, "DB_ENV->log_put_record", DB_INIT_LOG);
1720 
1721 	/* Validate arguments: check for allowed flags. */
1722 	if ((ret = __db_fchk(env, "DB_ENV->log_put_record", flags,
1723 	    DB_LOG_CHKPNT | DB_LOG_COMMIT |
1724 	    DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0)
1725 		return (ret);
1726 
1727 	/* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */
1728 	if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH))
1729 		return (__db_ferr(env, "DB_ENV->log_put_record", 1));
1730 
1731 	/* Replication clients should never write log records. */
1732 	if (IS_REP_CLIENT(env)) {
1733 		__db_errx(env, DB_STR("2511",
1734 		    "DB_ENV->log_put is illegal on replication clients"));
1735 		return (EINVAL);
1736 	}
1737 
1738 	ENV_ENTER(env, ip);
1739 	va_start(argp, spec);
1740 	REPLICATION_WRAP(env, (__log_put_record_int(env, dbp,
1741 	    txnp, ret_lsnp, flags, rectype, has_data, size, spec, argp)),
1742 	    0, ret);
1743 	va_end(argp);
1744 	ENV_LEAVE(env, ip);
1745 	return (ret);
1746 }
1747 
1748 /*
1749  * PUBLIC: int __log_put_record __P((ENV *, DB *, DB_TXN *, DB_LSN *,
1750  * PUBLIC:     u_int32_t, u_int32_t, u_int32_t, u_int32_t,
1751  * PUBLIC:     DB_LOG_RECSPEC *, ...));
1752  */
1753 int
__log_put_record(ENV * env,DB * dbp,DB_TXN * txnp,DB_LSN * ret_lsnp,u_int32_t flags,u_int32_t rectype,u_int32_t has_data,u_int32_t size,DB_LOG_RECSPEC * spec,...)1754 __log_put_record(ENV *env, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
1755     u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
1756     DB_LOG_RECSPEC *spec, ...)
1757 {
1758 	va_list argp;
1759 	int ret;
1760 
1761 	va_start(argp, spec);
1762 	ret = __log_put_record_int(env, dbp, txnp, ret_lsnp, flags,
1763 	    rectype, has_data, size, spec, argp);
1764 	va_end(argp);
1765 	return (ret);
1766 }
1767 
1768 static int
__log_put_record_int(ENV * env,DB * dbp,DB_TXN * txnp,DB_LSN * ret_lsnp,u_int32_t flags,u_int32_t rectype,u_int32_t has_data,u_int32_t size,DB_LOG_RECSPEC * spec,va_list argp)1769 __log_put_record_int(ENV *env, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
1770     u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
1771     DB_LOG_RECSPEC *spec, va_list argp)
1772 {
1773 	DBT *data, *dbt, *header, logrec;
1774 	DB_LOG_RECSPEC *sp;
1775 	DB_LSN *lsnp, lsn, null_lsn, *pagelsn, *rlsnp;
1776 	DB_TXNLOGREC *lr;
1777 	LOG *lp;
1778 	PAGE *pghdrstart;
1779 	u_int64_t ulltmp;
1780 	u_int32_t hdrsize, op, zero, uinttmp, txn_num;
1781 	u_int npad;
1782 	u_int8_t *bp;
1783 	int is_durable, ret;
1784 	void *hdrstart;
1785 
1786 	COMPQUIET(lr, NULL);
1787 	COMPQUIET(hdrsize, 0);
1788 	COMPQUIET(op, 0);
1789 	COMPQUIET(hdrstart, NULL);
1790 	COMPQUIET(pghdrstart, NULL);
1791 	COMPQUIET(header, NULL);
1792 
1793 	/*
1794 	 * rlsnp will be stored into while holding the log system lock.
1795 	 * If this is a commit record then ret_lsnp will be the address of
1796 	 * the transaction detail visible_lsn field.  If not then this
1797 	 * may be the lsn of a page and we do not want to set it if
1798 	 * the log_put fails after writing the record (due to an I/O error).
1799 	 */
1800 	if (LF_ISSET(DB_LOG_COMMIT))
1801 		rlsnp = ret_lsnp;
1802 	else
1803 		rlsnp = &lsn;
1804 	npad = 0;
1805 	ret = 0;
1806 	data = NULL;
1807 
1808 	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
1809 	    (dbp != NULL && F_ISSET(dbp, DB_AM_NOT_DURABLE))) {
1810 		if (txnp == NULL)
1811 			return (0);
1812 		is_durable = 0;
1813 	} else
1814 		is_durable = 1;
1815 
1816 	if (txnp == NULL) {
1817 		txn_num = 0;
1818 		lsnp = &null_lsn;
1819 		null_lsn.file = null_lsn.offset = 0;
1820 	} else {
1821 		if (TAILQ_FIRST(&txnp->kids) != NULL &&
1822 		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
1823 			return (ret);
1824 		/*
1825 		 * We need to assign begin_lsn while holding region mutex.
1826 		 * That assignment is done inside the __log_put call,
1827 		 * so pass in the appropriate memory location to be filled
1828 		 * in by the log_put code.
1829 		 */
1830 		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
1831 		txn_num = txnp->txnid;
1832 	}
1833 
1834 	if (dbp != NULL) {
1835 		DB_ASSERT(env, dbp->log_filename != NULL);
1836 		if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
1837 		    (ret = __dbreg_lazy_id(dbp)) != 0)
1838 			return (ret);
1839 	}
1840 
1841 	logrec.size = size;
1842 
1843 	if (CRYPTO_ON(env)) {
1844 		npad = env->crypto_handle->adj_size(logrec.size);
1845 		logrec.size += npad;
1846 	}
1847 
1848 	if (is_durable || txnp == NULL) {
1849 		if ((ret = __os_malloc(env, logrec.size, &logrec.data)) != 0)
1850 			return (ret);
1851 	} else {
1852 		if ((ret = __os_malloc(env,
1853 		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
1854 			return (ret);
1855 #ifdef DIAGNOSTIC
1856 		if ((ret =
1857 		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
1858 			__os_free(env, lr);
1859 			return (ret);
1860 		}
1861 #else
1862 		logrec.data = lr->data;
1863 #endif
1864 	}
1865 	if (npad > 0)
1866 		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
1867 
1868 	bp = logrec.data;
1869 
1870 	LOGCOPY_32(env, bp, &rectype);
1871 	bp += sizeof(rectype);
1872 
1873 	LOGCOPY_32(env, bp, &txn_num);
1874 	bp += sizeof(txn_num);
1875 
1876 	LOGCOPY_FROMLSN(env, bp, lsnp);
1877 	bp += sizeof(DB_LSN);
1878 
1879 	zero = 0;
1880 	lp = env->lg_handle->reginfo.primary;
1881 	for (sp = spec; sp->type != LOGREC_Done; sp++) {
1882 		switch (sp->type) {
1883 		case LOGREC_DB:
1884 			/* This is not in the varargs. */
1885 			uinttmp = (u_int32_t)dbp->log_filename->id;
1886 			LOGCOPY_32(env, bp, &uinttmp);
1887 			bp += sizeof(uinttmp);
1888 			break;
1889 
1890 		case LOGREC_ARG:
1891 		case LOGREC_TIME:
1892 		case LOGREC_DBOP:
1893 			uinttmp = va_arg(argp, u_int32_t);
1894 			LOGCOPY_32(env, bp, &uinttmp);
1895 			bp += sizeof(uinttmp);
1896 			break;
1897 		case LOGREC_LONGARG:
1898 			ulltmp = va_arg(argp, u_int64_t);
1899 			LOGCOPY_64(env, bp, &ulltmp);
1900 			bp += sizeof(ulltmp);
1901 			break;
1902 		case LOGREC_OP:
1903 			op = va_arg(argp, u_int32_t);
1904 			LOGCOPY_32(env, bp, &op);
1905 			bp += sizeof(op);
1906 			break;
1907 		case LOGREC_DBT:
1908 		case LOGREC_PGLIST:
1909 		case LOGREC_LOCKS:
1910 		case LOGREC_HDR:
1911 		case LOGREC_DATA:
1912 			dbt = va_arg(argp, DBT *);
1913 			if (dbt == NULL) {
1914 				LOGCOPY_32(env, bp, &zero);
1915 				bp += sizeof(u_int32_t);
1916 			} else {
1917 				LOGCOPY_32(env, bp, &dbt->size);
1918 				bp += sizeof(dbt->size);
1919 				memcpy(bp, dbt->data, dbt->size);
1920 			}
1921 			/* Process fields that need to be byte swapped. */
1922 			if (dbp != NULL && F_ISSET(dbp, DB_AM_SWAP)) {
1923 				if (sp->type == LOGREC_HDR &&
1924 				    dbt != NULL && has_data == 0)
1925 					__db_recordswap(op,
1926 					    dbt->size, bp, NULL, 0);
1927 				else if (sp->type == LOGREC_HDR) {
1928 					hdrstart = bp;
1929 					hdrsize = dbt == NULL ? 0 : dbt->size;
1930 				} else if (sp->type == LOGREC_DATA) {
1931 					__db_recordswap(op,
1932 					    hdrsize, hdrstart, bp, 0);
1933 					has_data = 0;
1934 				}
1935 			}
1936 			if (dbt != NULL)
1937 				bp += dbt->size;
1938 
1939 			break;
1940 		/*
1941 		 * Page header and data -- we assume that the header
1942 		 * is listed first and the data follows sometime later.
1943 		 * There should be only one header/data pair per record.
1944 		 */
1945 		case LOGREC_PGDBT:
1946 			header = va_arg(argp, DBT *);
1947 			if (header == NULL) {
1948 				LOGCOPY_32(env, bp, &zero);
1949 				bp += sizeof(u_int32_t);
1950 			} else {
1951 				LOGCOPY_32(env, bp, &header->size);
1952 				bp += sizeof(header->size);
1953 				pghdrstart = (PAGE *)bp;
1954 				memcpy(bp, header->data, header->size);
1955 				if (has_data == 0 &&
1956 				    F_ISSET(dbp, DB_AM_SWAP) &&
1957 				    (ret = __db_pageswap(
1958 				     env, dbp, pghdrstart, (size_t)header->size,
1959 				     NULL, 0)) != 0)
1960 					return (ret);
1961 				bp += header->size;
1962 			}
1963 			break;
1964 
1965 		case LOGREC_PGDDBT:
1966 			data = va_arg(argp, DBT *);
1967 			if (data == NULL) {
1968 				zero = 0;
1969 				LOGCOPY_32(env, bp, &zero);
1970 				bp += sizeof(u_int32_t);
1971 			} else {
1972 				if (F_ISSET(dbp, DB_AM_SWAP) &&
1973 				    (ret = __db_pageswap(env, dbp, pghdrstart,
1974 				    (size_t)header->size, (DBT *)data, 0)) != 0)
1975 					return (ret);
1976 				LOGCOPY_32(env, bp, &data->size);
1977 				bp += sizeof(data->size);
1978 				memcpy(bp, data->data, data->size);
1979 				if (F_ISSET(dbp, DB_AM_SWAP) &&
1980 				     F_ISSET(data, DB_DBT_APPMALLOC))
1981 					__os_free(env, data->data);
1982 				bp += data->size;
1983 			}
1984 			break;
1985 		case LOGREC_POINTER:
1986 			pagelsn = va_arg(argp, DB_LSN *);
1987 			if (pagelsn != NULL) {
1988 				if (txnp != NULL) {
1989 					if (LOG_COMPARE(pagelsn,
1990 					    &lp->lsn) >= 0 && (ret =
1991 					    __log_check_page_lsn(env,
1992 					    dbp, pagelsn)) != 0)
1993 						return (ret);
1994 				}
1995 				LOGCOPY_FROMLSN(env, bp, pagelsn);
1996 			} else
1997 				memset(bp, 0, sizeof(*pagelsn));
1998 			bp += sizeof(*pagelsn);
1999 			break;
2000 
2001 		default:
2002 			DB_ASSERT(env, sp->type != sp->type);
2003 		}
2004 	}
2005 
2006 	DB_ASSERT(env,
2007 	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
2008 
2009 	if (is_durable || txnp == NULL) {
2010 		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
2011 		    flags | DB_LOG_NOCOPY)) == 0) {
2012 			if (txnp != NULL)
2013 				*lsnp = *rlsnp;
2014 			*ret_lsnp = *rlsnp;
2015 		}
2016 	} else {
2017 		ret = 0;
2018 #ifdef DIAGNOSTIC
2019 		/*
2020 		 * Set the debug bit if we are going to log non-durable
2021 		 * transactions so they will be ignored by recovery.
2022 		 */
2023 		memcpy(lr->data, logrec.data, logrec.size);
2024 		rectype |= DB_debug_FLAG;
2025 		LOGCOPY_32(env, logrec.data, &rectype);
2026 
2027 		if (!IS_REP_CLIENT(env) && !lp->db_log_inmemory)
2028 			ret = __log_put(env,
2029 			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
2030 #endif
2031 		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
2032 		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
2033 		LSN_NOT_LOGGED(*ret_lsnp);
2034 	}
2035 
2036 #ifdef LOG_DIAGNOSTIC
2037 	if (ret != 0)
2038 		(void)__db_addrem_print(env,
2039 		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
2040 #endif
2041 
2042 #ifdef DIAGNOSTIC
2043 	__os_free(env, logrec.data);
2044 #else
2045 	if (is_durable || txnp == NULL)
2046 		__os_free(env, logrec.data);
2047 #endif
2048 	return (ret);
2049 }
2050