1 /*-
2 * Copyright (c) 1996, 2020 Oracle and/or its affiliates. All rights reserved.
3 *
4 * See the file LICENSE for license information.
5 *
6 * $Id$
7 */
8
9 #include "db_config.h"
10
11 #include "db_int.h"
12 #include "dbinc/crypto.h"
13 #include "dbinc/hmac.h"
14 #include "dbinc/log.h"
15 #include "dbinc/txn.h"
16 #include "dbinc/db_page.h"
17 #include "dbinc_auto/db_ext.h"
18
19 static int __log_encrypt_record __P((ENV *, DBT *, HDR *, u_int32_t));
20 static int __log_file __P((ENV *, const DB_LSN *, char *, size_t));
21 static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
22 static int __log_flush_commit __P((ENV *, const DB_LSN *, u_int32_t));
23 static int __log_newfh __P((DB_LOG *, int));
24 static int __log_put_next __P((ENV *,
25 DB_LSN *, const DBT *, HDR *, DB_LSN *));
26 static int __log_put_record_int __P((ENV *, DB *, DB_TXN *, DB_LSN *,
27 u_int32_t, u_int32_t, u_int32_t, u_int32_t, DB_LOG_RECSPEC *, va_list));
28 static int __log_putr __P((DB_LOG *,
29 DB_LSN *, const DBT *, u_int32_t, HDR *));
30 static int __log_write __P((DB_LOG *, void *, u_int32_t));
31
32 /*
33 * __log_put_pp --
34 * ENV->log_put pre/post processing.
35 *
36 * PUBLIC: int __log_put_pp __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
37 */
38 int
__log_put_pp(dbenv,lsnp,udbt,flags)39 __log_put_pp(dbenv, lsnp, udbt, flags)
40 DB_ENV *dbenv;
41 DB_LSN *lsnp;
42 const DBT *udbt;
43 u_int32_t flags;
44 {
45 DB_THREAD_INFO *ip;
46 ENV *env;
47 int ret;
48
49 env = dbenv->env;
50
51 ENV_REQUIRES_CONFIG(env,
52 env->lg_handle, "DB_ENV->log_put", DB_INIT_LOG);
53
54 /* Validate arguments: check for allowed flags. */
55 if ((ret = __db_fchk(env, "DB_ENV->log_put", flags,
56 DB_LOG_CHKPNT | DB_LOG_COMMIT |
57 DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0)
58 return (ret);
59
60 /* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */
61 if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH))
62 return (__db_ferr(env, "DB_ENV->log_put", 1));
63
64 /* Replication clients should never write log records. */
65 if (IS_REP_CLIENT(env)) {
66 __db_errx(env, DB_STR("2511",
67 "DB_ENV->log_put is illegal on replication clients"));
68 return (EINVAL);
69 }
70
71 ENV_ENTER(env, ip);
72 REPLICATION_WRAP(env, (__log_put(env, lsnp, udbt, flags)), 0, ret);
73 ENV_LEAVE(env, ip);
74 return (ret);
75 }
76
77 /*
78 * __log_put --
79 * ENV->log_put.
80 *
81 * PUBLIC: int __log_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
82 */
83 int
__log_put(env,lsnp,udbt,flags)84 __log_put(env, lsnp, udbt, flags)
85 ENV *env;
86 DB_LSN *lsnp;
87 const DBT *udbt;
88 u_int32_t flags;
89 {
90 DBT *dbt, t;
91 DB_CIPHER *db_cipher;
92 DB_LOG *dblp;
93 DB_LSN lsn, old_lsn;
94 DB_REP *db_rep;
95 HDR hdr;
96 LOG *lp;
97 REP *rep;
98 int lock_held, need_free, ret;
99 u_int8_t *key;
100
101 COMPQUIET(rep, NULL);
102
103 dblp = env->lg_handle;
104 lp = dblp->reginfo.primary;
105 db_cipher = env->crypto_handle;
106 db_rep = env->rep_handle;
107 if (db_rep != NULL)
108 rep = db_rep->region;
109 else
110 rep = NULL;
111
112 dbt = &t;
113 t = *udbt;
114 lock_held = need_free = 0;
115 ZERO_LSN(old_lsn);
116 hdr.len = hdr.prev = 0;
117
118 /*
119 * In general, if we are not a rep application, but are sharing a master
120 * rep env, we should not be writing log records. However, we can allow
121 * a non-replication-aware process to join a pre-existing repmgr
122 * environment, if env handle meets repmgr's DB_THREAD requirement.
123 */
124
125 if (IS_REP_MASTER(env) && db_rep->send == NULL) {
126 #ifdef HAVE_REPLICATION_THREADS
127 if (F_ISSET(env, ENV_THREAD) && APP_IS_REPMGR(env)) {
128 if ((ret = __repmgr_autostart(env)) != 0)
129 return (ret);
130 } else
131 #endif
132 {
133 #if !defined(DEBUG_ROP) && !defined(DEBUG_WOP)
134 __db_errx(env, DB_STR("2512",
135 "Non-replication DB_ENV handle attempting "
136 "to modify a replicated environment"));
137 return (EINVAL);
138 #endif
139 }
140 }
141
142 if (IS_REP_CLIENT(env)) {
143 __db_errx(env, DB_STR("2590",
144 "log_put is illegal on replication clients"));
145 #if !defined(DIAGNOSTIC)
146 /*
147 * DB_ASSERT would generate a stack if DIAGNOSTIC is true.
148 */
149 __os_stack(env);
150 return (__env_panic(env, EINVAL));
151 #endif
152
153 DB_ASSERT(env, FALSE);
154 }
155
156 /*
157 * If we are coming from the logging code, we use an internal flag,
158 * DB_LOG_NOCOPY, because we know we can overwrite/encrypt the log
159 * record in place. Otherwise, if a user called log_put then we
160 * must copy it to new memory so that we know we can write it.
161 *
162 * We also must copy it to new memory if we are a replication master
163 * so that we retain an unencrypted copy of the log record to send
164 * to clients.
165 */
166 if (!LF_ISSET(DB_LOG_NOCOPY) || IS_REP_MASTER(env)) {
167 if (CRYPTO_ON(env))
168 t.size += db_cipher->adj_size(udbt->size);
169 if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
170 goto err;
171 need_free = 1;
172 memcpy(t.data, udbt->data, udbt->size);
173 }
174 if ((ret = __log_encrypt_record(env, dbt, &hdr, udbt->size)) != 0)
175 goto err;
176 if (CRYPTO_ON(env))
177 key = db_cipher->mac_key;
178 else
179 key = NULL;
180 #ifdef HAVE_LOG_CHECKSUM
181 __db_chksum(&hdr, dbt->data, dbt->size, key, hdr.chksum);
182 #endif
183
184 LOG_SYSTEM_LOCK(env);
185 lock_held = 1;
186
187 if ((ret = __log_put_next(env, &lsn, dbt, &hdr, &old_lsn)) != 0)
188 goto panic_check;
189
190 /*
191 * Assign the return LSN before dropping the region lock. Necessary
192 * in case the lsn is a begin_lsn from a TXN_DETAIL structure passed in
193 * by the logging routines. We use atomic 32-bit operations because
194 * during commit this will be a TXN_DETAIL visible_lsn field, and MVCC
195 * relies on reading the fields atomically.
196 */
197 lsnp->file = lsn.file;
198 lsnp->offset = lsn.offset;
199
200 #ifdef HAVE_REPLICATION
201 if (IS_REP_MASTER(env)) {
202 __rep_newfile_args nf_args;
203 DBT newfiledbt;
204 REP_BULK bulk;
205 size_t len;
206 u_int32_t ctlflags;
207 u_int8_t buf[__REP_NEWFILE_SIZE];
208
209 /*
210 * Replication masters need to drop the lock to send messages,
211 * but want to drop and reacquire it a minimal number of times.
212 */
213 ctlflags = LF_ISSET(DB_LOG_COMMIT | DB_LOG_CHKPNT) ?
214 REPCTL_PERM : 0;
215 LOG_SYSTEM_UNLOCK(env);
216 lock_held = 0;
217 if (LF_ISSET(DB_FLUSH))
218 ctlflags |= REPCTL_FLUSH;
219
220 /*
221 * If we changed files and we're in a replicated environment,
222 * we need to inform our clients now that we've dropped the
223 * region lock.
224 *
225 * Note that a failed NEWFILE send is a dropped message that
226 * our client can handle, so we can ignore it. It's possible
227 * that the record we already put is a commit, so we don't just
228 * want to return failure.
229 */
230 if (!IS_ZERO_LSN(old_lsn)) {
231 memset(&newfiledbt, 0, sizeof(newfiledbt));
232 nf_args.version = lp->persist.version;
233 (void)__rep_newfile_marshal(env, &nf_args,
234 buf, __REP_NEWFILE_SIZE, &len);
235 DB_INIT_DBT(newfiledbt, buf, len);
236 (void)__rep_send_message(env, DB_EID_BROADCAST,
237 REP_NEWFILE, &old_lsn, &newfiledbt, 0, 0);
238 }
239
240 /*
241 * If we're doing bulk processing put it in the bulk buffer.
242 */
243 ret = 0;
244 if (FLD_ISSET(rep->config, REP_C_BULK)) {
245 /*
246 * Bulk could have been turned on by another process.
247 * If so, set the address into the bulk region now.
248 */
249 if (db_rep->bulk == NULL)
250 db_rep->bulk = R_ADDR(&dblp->reginfo,
251 lp->bulk_buf);
252 memset(&bulk, 0, sizeof(bulk));
253 bulk.addr = db_rep->bulk;
254 bulk.offp = &lp->bulk_off;
255 bulk.len = lp->bulk_len;
256 bulk.lsn = lsn;
257 bulk.type = REP_BULK_LOG;
258 bulk.eid = DB_EID_BROADCAST;
259 bulk.flagsp = &lp->bulk_flags;
260 ret = __rep_bulk_message(env, &bulk, NULL,
261 &lsn, udbt, ctlflags);
262 }
263 if (!FLD_ISSET(rep->config, REP_C_BULK) ||
264 ret == DB_REP_BULKOVF) {
265 /*
266 * Then send the log record itself on to our clients.
267 */
268 /*
269 * !!!
270 * In the crypto case, we MUST send the udbt, not the
271 * now-encrypted dbt. Clients have no way to decrypt
272 * without the header.
273 */
274 ret = __rep_send_message(env, DB_EID_BROADCAST,
275 REP_LOG, &lsn, udbt, ctlflags, 0);
276 }
277 if (FLD_ISSET(ctlflags, REPCTL_PERM)) {
278 LOG_SYSTEM_LOCK(env);
279 #ifdef HAVE_STATISTICS
280 if (IS_USING_LEASES(env))
281 rep->stat.st_lease_sends++;
282 #endif
283 /*
284 * Keep track of our last PERM lsn. Set this on a
285 * master under the log lock. When using leases, if
286 * we set max_perm_lsn too early (before the send)
287 * then we hit a lot of false invalid lease checks
288 * which all try to refresh and hurt performance.
289 */
290 if (LOG_COMPARE(&lp->max_perm_lsn, &lsn) < 0)
291 lp->max_perm_lsn = lsn;
292 LOG_SYSTEM_UNLOCK(env);
293 }
294 /*
295 * If the send fails and we're a commit or checkpoint,
296 * there's nothing we can do; the record's in the log.
297 * Flush it, even if we're running with TXN_NOSYNC,
298 * on the grounds that it should be in durable form somewhere.
299 */
300 if (ret != 0 && FLD_ISSET(ctlflags, REPCTL_PERM))
301 LF_SET(DB_FLUSH);
302 /*
303 * We ignore send failures so reset 'ret' to 0 here.
304 * We needed to check special return values from
305 * bulk transfer and errors from either bulk or normal
306 * message sending need flushing on perm records. But
307 * otherwise we need to ignore it and reset it now.
308 */
309 ret = 0;
310 }
311 #endif
312
313 /*
314 * If needed, do a flush. Note that failures at this point
315 * are only permissible if we know we haven't written a commit
316 * record; __log_flush_commit is responsible for enforcing this.
317 *
318 * If a flush is not needed, see if WRITE_NOSYNC was set and we
319 * need to write out the log buffer.
320 */
321 if (LF_ISSET(DB_FLUSH | DB_LOG_WRNOSYNC)) {
322 if (!lock_held) {
323 LOG_SYSTEM_LOCK(env);
324 lock_held = 1;
325 }
326 if ((ret = __log_flush_commit(env, &lsn, flags)) != 0)
327 goto panic_check;
328 }
329
330 /*
331 * If flushed a checkpoint record, reset the "bytes since the last
332 * checkpoint" counters.
333 */
334 if (LF_ISSET(DB_LOG_CHKPNT))
335 lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
336
337 /* Increment count of records added to the log. */
338 STAT(++lp->stat.st_record);
339
340 if (0) {
341 panic_check: /*
342 * Writing log records cannot fail if we're a replication
343 * master. The reason is that once we send the record to
344 * replication clients, the transaction can no longer
345 * abort, otherwise the master would be out of sync with
346 * the rest of the replication group. Panic the system.
347 */
348 if (ret != 0 && IS_REP_MASTER(env))
349 ret = __env_panic(env, ret);
350 }
351
352 err: if (lock_held)
353 LOG_SYSTEM_UNLOCK(env);
354 if (need_free)
355 __os_free(env, dbt->data);
356
357 /*
358 * If auto-remove is set and we switched files, remove unnecessary
359 * log files.
360 */
361 if (ret == 0 && !IS_ZERO_LSN(old_lsn) && lp->db_log_autoremove)
362 __log_autoremove(env);
363
364 return (ret);
365 }
366
367 /*
368 * __log_current_lsn_int --
369 * internal operations of __log_current_lsn
370 *
371 * PUBLIC: int __log_current_lsn_int
372 * PUBLIC: __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
373 */
374 int
__log_current_lsn_int(env,lsnp,mbytesp,bytesp)375 __log_current_lsn_int(env, lsnp, mbytesp, bytesp)
376 ENV *env;
377 DB_LSN *lsnp;
378 u_int32_t *mbytesp, *bytesp;
379 {
380 DB_LOG *dblp;
381 LOG *lp;
382
383 dblp = env->lg_handle;
384 lp = dblp->reginfo.primary;
385
386 LOG_SYSTEM_LOCK(env);
387
388 /*
389 * We need the LSN of the last entry in the log.
390 *
391 * Typically, it's easy to get the last written LSN, you simply look
392 * at the current log pointer and back up the number of bytes of the
393 * last log record. However, if the last thing we did was write the
394 * log header of a new log file, then, this doesn't work, so we return
395 * the first log record that will be written in this new file.
396 */
397 *lsnp = lp->lsn;
398 if (lp->lsn.offset > lp->len)
399 lsnp->offset -= lp->len;
400
401 /*
402 * Since we're holding the log region lock, return the bytes put into
403 * the log since the last checkpoint, transaction checkpoint needs it.
404 *
405 * We add the current buffer offset so as to count bytes that have not
406 * yet been written, but are sitting in the log buffer.
407 */
408 if (mbytesp != NULL) {
409 *mbytesp = lp->stat.st_wc_mbytes;
410 *bytesp = (u_int32_t)(lp->stat.st_wc_bytes + lp->b_off);
411 }
412
413 LOG_SYSTEM_UNLOCK(env);
414
415 return (0);
416 }
417
418 /*
419 * __log_current_lsn --
420 * Return the current LSN.
421 *
422 * PUBLIC: int __log_current_lsn
423 * PUBLIC: __P((ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
424 */
425 int
__log_current_lsn(env,lsnp,mbytesp,bytesp)426 __log_current_lsn(env, lsnp, mbytesp, bytesp)
427 ENV *env;
428 DB_LSN *lsnp;
429 u_int32_t *mbytesp, *bytesp;
430 {
431 DB_THREAD_INFO *ip;
432 int ret;
433
434 ret = 0;
435 ENV_ENTER(env, ip);
436 ret = __log_current_lsn_int(env, lsnp, mbytesp, bytesp);
437 ENV_LEAVE(env, ip);
438
439 return ret;
440 }
441
442 /*
443 * __log_put_next --
444 * Put the given record as the next in the log, wherever that may
445 * turn out to be.
446 */
447 static int
__log_put_next(env,lsn,dbt,hdr,old_lsnp)448 __log_put_next(env, lsn, dbt, hdr, old_lsnp)
449 ENV *env;
450 DB_LSN *lsn;
451 const DBT *dbt;
452 HDR *hdr;
453 DB_LSN *old_lsnp;
454 {
455 DB_LOG *dblp;
456 DB_LSN old_lsn;
457 LOG *lp;
458 int adv_file, newfile, ret;
459
460 dblp = env->lg_handle;
461 lp = dblp->reginfo.primary;
462
463 /*
464 * Save a copy of lp->lsn before we might decide to switch log
465 * files and change it. If we do switch log files, and we're
466 * doing replication, we'll need to tell our clients about the
467 * switch, and they need to receive a NEWFILE message
468 * with this "would-be" LSN in order to know they're not
469 * missing any log records.
470 */
471 old_lsn = lp->lsn;
472 newfile = 0;
473 adv_file = 0;
474 /*
475 * If our current log is at an older version and we want to write
476 * a record then we need to advance the log.
477 */
478 if (lp->persist.version != DB_LOGVERSION) {
479 __log_set_version(env, DB_LOGVERSION);
480 adv_file = 1;
481 }
482
483 /*
484 * If this information won't fit in the file, or if we're a
485 * replication client environment and have been told to do so,
486 * swap files.
487 */
488 if (adv_file || lp->lsn.offset == 0 ||
489 lp->lsn.offset + hdr->size + dbt->size > lp->log_size) {
490 if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_nsize) {
491 __db_errx(env, DB_STR_A("2513",
492 "DB_ENV->log_put: record larger than maximum file size (%lu > %lu)",
493 "%lu %lu"),
494 (u_long)hdr->size + sizeof(LOGP) + dbt->size,
495 (u_long)lp->log_nsize);
496 return (EINVAL);
497 }
498
499 if ((ret = __log_newfile(dblp, NULL, 0, 0)) != 0)
500 return (ret);
501
502 /*
503 * Flag that we switched files, in case we're a master
504 * and need to send this information to our clients.
505 * We postpone doing the actual send until we can
506 * safely release the log region lock and are doing so
507 * anyway.
508 */
509 newfile = 1;
510 }
511
512 /* If we switched log files, let our caller know where. */
513 if (newfile)
514 *old_lsnp = old_lsn;
515
516 /* Actually put the record. */
517 return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr));
518 }
519
520 /*
521 * __log_flush_commit --
522 * Flush a record.
523 */
524 static int
__log_flush_commit(env,lsnp,flags)525 __log_flush_commit(env, lsnp, flags)
526 ENV *env;
527 const DB_LSN *lsnp;
528 u_int32_t flags;
529 {
530 DB_LOG *dblp;
531 DB_LSN flush_lsn;
532 HDR hdr;
533 LOG *lp;
534 int ret, t_ret;
535 size_t nr, nw;
536 u_int8_t *buffer;
537
538 dblp = env->lg_handle;
539 lp = dblp->reginfo.primary;
540 flush_lsn = *lsnp;
541
542 ret = 0;
543
544 /*
545 * DB_FLUSH:
546 * Flush a record for which the DB_FLUSH flag to log_put was set.
547 *
548 * DB_LOG_WRNOSYNC:
549 * If there's anything in the current log buffer, write it out.
550 */
551 if (LF_ISSET(DB_FLUSH))
552 ret = __log_flush_int(dblp, &flush_lsn, 1);
553 else if (!lp->db_log_inmemory && lp->b_off != 0)
554 if ((ret = __log_write(dblp,
555 dblp->bufp, (u_int32_t)lp->b_off)) == 0)
556 lp->b_off = 0;
557
558 /*
559 * If a flush supporting a transaction commit fails, we must abort the
560 * transaction. (If we aren't doing a commit, return the failure; if
561 * if the commit we care about made it to disk successfully, we just
562 * ignore the failure, because there's no way to undo the commit.)
563 */
564 if (ret == 0 || !LF_ISSET(DB_LOG_COMMIT))
565 return (ret);
566
567 if (LF_ISSET(DB_FLUSH) ?
568 flush_lsn.file != lp->s_lsn.file ||
569 flush_lsn.offset < lp->s_lsn.offset :
570 flush_lsn.file != lp->lsn.file || flush_lsn.offset < lp->w_off)
571 return (0);
572
573 if (IS_REP_MASTER(env)) {
574 __db_err(env, ret, DB_STR("2514",
575 "Write failed on MASTER commit."));
576 return (__env_panic(env, ret));
577 }
578 /*
579 * If this is a panic don't attempt to abort just this transaction;
580 * it may trip over the panic, and the whole env needs to go anyway.
581 */
582 if (ret == DB_RUNRECOVERY)
583 return (__env_panic(env, ret));
584 /*
585 * Else, make sure that the commit record does not get out after we
586 * abort the transaction. Do this by overwriting the commit record
587 * in the buffer. (Note that other commits in this buffer will wait
588 * until a successful write happens, we do not wake them.) We point
589 * at the right part of the buffer and write an abort record over the
590 * commit. We must then try and flush the buffer again, since the
591 * interesting part of the buffer may have actually made it out to
592 * disk before there was a failure, we can't know for sure.
593 */
594 if (flush_lsn.offset > lp->w_off) {
595 if ((t_ret = __txn_force_abort(env,
596 dblp->bufp + flush_lsn.offset - lp->w_off)) != 0)
597 return (__env_panic(env, t_ret));
598 } else {
599 /*
600 * The buffer was written, but its not on disk, we
601 * must read it back and force things from a commit
602 * state to an abort state. Lots of things could fail
603 * here and we will be left with a commit record but
604 * a panic return.
605 */
606 if (
607 (t_ret = __os_seek(env,
608 dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
609 (t_ret = __os_read(env, dblp->lfhp, &hdr,
610 HDR_NORMAL_SZ, &nr)) != 0 || nr != HDR_NORMAL_SZ)
611 return (__env_panic(env, t_ret == 0 ? EIO : t_ret));
612 if (LOG_SWAPPED(env))
613 __log_hdrswap(&hdr, CRYPTO_ON(env));
614 if ((t_ret = __os_malloc(env, hdr.len, &buffer)) != 0 ||
615 (t_ret = __os_seek(env,
616 dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
617 (t_ret = __os_read(env, dblp->lfhp, buffer,
618 hdr.len, &nr)) != 0 || nr != hdr.len ||
619 (t_ret = __txn_force_abort(env, buffer)) != 0 ||
620 (t_ret = __os_seek(env,
621 dblp->lfhp, 0, 0, flush_lsn.offset)) != 0 ||
622 (t_ret = __os_write(env, dblp->lfhp, buffer,
623 nr, &nw)) != 0 || nw != nr)
624 return (__env_panic(env, t_ret == 0 ? EIO : t_ret));
625 __os_free(env, buffer);
626 }
627 /*
628 * Try to flush the log again, if the disk just bounced then we
629 * want to be sure it does not go away again before we write the
630 * abort record.
631 */
632 (void)__log_flush_int(dblp, &flush_lsn, 0);
633
634 return (ret);
635 }
636
637 /*
638 * __log_newfile --
639 * Initialize and switch to a new log file. (Note that this is
640 * called both when no log yet exists and when we fill a log file.)
641 *
642 * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *, u_int32_t, u_int32_t));
643 */
644 int
__log_newfile(dblp,lsnp,logfile,version)645 __log_newfile(dblp, lsnp, logfile, version)
646 DB_LOG *dblp;
647 DB_LSN *lsnp;
648 u_int32_t logfile;
649 u_int32_t version;
650 {
651 DBT t;
652 DB_CIPHER *db_cipher;
653 DB_LSN lsn;
654 ENV *env;
655 HDR hdr;
656 LOG *lp;
657 LOGP *tpersist;
658 int need_free, ret;
659 u_int32_t lastoff;
660 size_t tsize;
661
662 env = dblp->env;
663 lp = dblp->reginfo.primary;
664
665 /*
666 * If we're not specifying a specific log file number and we're
667 * not at the beginning of a file already, start a new one.
668 */
669 if (logfile == 0 && lp->lsn.offset != 0) {
670 /*
671 * Flush the log so this file is out and can be closed. We
672 * cannot release the region lock here because we need to
673 * protect the end of the file while we switch. In
674 * particular, a thread with a smaller record than ours
675 * could detect that there is space in the log. Even
676 * blocking that event by declaring the file full would
677 * require all threads to wait here so that the lsn.file
678 * can be moved ahead after the flush completes. This
679 * probably can be changed if we had an lsn for the
680 * previous file and one for the current, but it does not
681 * seem like this would get much more throughput, if any.
682 */
683 if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
684 return (ret);
685
686 /*
687 * Save the last known offset from the previous file, we'll
688 * need it to initialize the persistent header information.
689 */
690 lastoff = lp->lsn.offset;
691
692 /* Point the current LSN to the new file. */
693 ++lp->lsn.file;
694 lp->lsn.offset = 0;
695
696 /* Reset the file write offset. */
697 lp->w_off = 0;
698 } else
699 lastoff = 0;
700
701 /*
702 * Replication may require we reset the log file name space entirely.
703 * In that case we also force a file switch so that replication can
704 * clean up old files.
705 */
706 if (logfile != 0) {
707 lp->lsn.file = logfile;
708 lp->lsn.offset = 0;
709 lp->w_off = 0;
710 if (lp->db_log_inmemory) {
711 lsn = lp->lsn;
712 (void)__log_zero(env, &lsn);
713 } else {
714 lp->s_lsn = lp->lsn;
715 if ((ret = __log_newfh(dblp, 1)) != 0)
716 return (ret);
717 }
718 }
719
720 DB_ASSERT(env, lp->db_log_inmemory || lp->b_off == 0);
721 if (lp->db_log_inmemory &&
722 (ret = __log_inmem_newfile(dblp, lp->lsn.file)) != 0)
723 return (ret);
724
725 /*
726 * Insert persistent information as the first record in every file.
727 * Note that the previous length is wrong for the very first record
728 * of the log, but that's okay, we check for it during retrieval.
729 */
730 memset(&t, 0, sizeof(t));
731 memset(&hdr, 0, sizeof(HDR));
732
733 need_free = 0;
734 tsize = sizeof(LOGP);
735 db_cipher = env->crypto_handle;
736 if (CRYPTO_ON(env))
737 tsize += db_cipher->adj_size(tsize);
738 if ((ret = __os_calloc(env, 1, tsize, &tpersist)) != 0)
739 return (ret);
740 need_free = 1;
741 /*
742 * If we're told what version to make this file, then we
743 * need to be at that version. Update here.
744 */
745 if (version != 0) {
746 __log_set_version(env, version);
747 if ((ret = __env_init_rec(env, version)) != 0)
748 goto err;
749 }
750 lp->persist.log_size = lp->log_size = lp->log_nsize;
751 memcpy(tpersist, &lp->persist, sizeof(LOGP));
752 DB_SET_DBT(t, tpersist, tsize);
753 if (LOG_SWAPPED(env))
754 __log_persistswap(tpersist);
755
756 if ((ret =
757 __log_encrypt_record(env, &t, &hdr, (u_int32_t)sizeof(LOGP))) != 0)
758 goto err;
759
760 if ((ret = __log_putr(dblp, &lsn,
761 &t, lastoff == 0 ? 0 : lastoff - lp->len, &hdr)) != 0)
762 goto err;
763
764 /* Update the LSN information returned to the caller. */
765 if (lsnp != NULL)
766 *lsnp = lp->lsn;
767
768 err: if (need_free)
769 __os_free(env, tpersist);
770 return (ret);
771 }
772
773 /*
774 * __log_putr --
775 * Actually put a record into the log.
776 */
777 static int
__log_putr(dblp,lsn,dbt,prev,h)778 __log_putr(dblp, lsn, dbt, prev, h)
779 DB_LOG *dblp;
780 DB_LSN *lsn;
781 const DBT *dbt;
782 u_int32_t prev;
783 HDR *h;
784 {
785 DB_CIPHER *db_cipher;
786 DB_LSN f_lsn;
787 ENV *env;
788 HDR tmp, *hdr;
789 LOG *lp;
790 int ret, t_ret;
791 db_size_t b_off;
792 size_t nr;
793 u_int32_t w_off;
794
795 env = dblp->env;
796 lp = dblp->reginfo.primary;
797
798 /*
799 * If we weren't given a header, use a local one.
800 */
801 db_cipher = env->crypto_handle;
802 if (h == NULL) {
803 hdr = &tmp;
804 memset(hdr, 0, sizeof(HDR));
805 if (CRYPTO_ON(env))
806 hdr->size = HDR_CRYPTO_SZ;
807 else
808 hdr->size = HDR_NORMAL_SZ;
809 } else
810 hdr = h;
811
812 /* Save our position in case we fail. */
813 b_off = lp->b_off;
814 w_off = lp->w_off;
815 f_lsn = lp->f_lsn;
816
817 /*
818 * Initialize the header. If we just switched files, lsn.offset will
819 * be 0, and what we really want is the offset of the previous record
820 * in the previous file. Fortunately, prev holds the value we want.
821 */
822 hdr->prev = prev;
823 hdr->len = (u_int32_t)hdr->size + dbt->size;
824
825 #ifdef HAVE_LOG_CHECKSUM
826 /*
827 * If we were passed in a nonzero checksum, our caller calculated
828 * the checksum before acquiring the log mutex, as an optimization.
829 *
830 * If our caller calculated a real checksum of 0, we'll needlessly
831 * recalculate it. C'est la vie; there's no out-of-bounds value
832 * here.
833 */
834 if (hdr->chksum[0] == 0) {
835 if (lp->persist.version < DB_LOGCHKSUM)
836 __db_chksum(NULL, dbt->data, dbt->size,
837 (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
838 hdr->chksum);
839 else
840 __db_chksum(hdr, dbt->data, dbt->size,
841 (CRYPTO_ON(env)) ? db_cipher->mac_key : NULL,
842 hdr->chksum);
843 } else if (lp->persist.version >= DB_LOGCHKSUM)
844 /*
845 * We need to include hdr->prev and len here, since they were
846 * still zero at the time of the caller's __db_chksum() call.
847 */
848 LOG_HDR_SUM(CRYPTO_ON(env), hdr, hdr->chksum);
849 #endif
850
851 if (lp->db_log_inmemory && (ret = __log_inmem_chkspace(dblp,
852 (u_int32_t)hdr->size + dbt->size)) != 0)
853 goto err;
854
855 /*
856 * The offset into the log file at this point is the LSN where
857 * we're about to put this record, and is the LSN the caller wants.
858 */
859 *lsn = lp->lsn;
860
861 nr = hdr->size;
862 if (LOG_SWAPPED(env))
863 __log_hdrswap(hdr, CRYPTO_ON(env));
864
865 /* nr can't overflow a 32 bit value - header size is internal. */
866 ret = __log_fill(dblp, lsn, hdr, (u_int32_t)nr);
867
868 if (LOG_SWAPPED(env))
869 __log_hdrswap(hdr, CRYPTO_ON(env));
870
871 if (ret != 0)
872 goto err;
873
874 if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0)
875 goto err;
876
877 lp->len = (u_int32_t)(hdr->size + dbt->size);
878 lp->lsn.offset += lp->len;
879 return (0);
880 err:
881 /*
882 * If we wrote more than one buffer before failing, get the
883 * first one back. The extra buffers will fail the checksums
884 * and be ignored.
885 */
886 if (w_off + lp->buffer_size < lp->w_off) {
887 DB_ASSERT(env, !lp->db_log_inmemory);
888 if ((t_ret = __os_seek(env, dblp->lfhp, 0, 0, w_off)) != 0 ||
889 (t_ret = __os_read(env, dblp->lfhp, dblp->bufp,
890 b_off, &nr)) != 0)
891 return (__env_panic(env, t_ret));
892 if (nr != b_off) {
893 __db_errx(env, DB_STR("2515",
894 "Short read while restoring log"));
895 return (__env_panic(env, EIO));
896 }
897 }
898
899 /* Reset to where we started. */
900 lp->w_off = w_off;
901 lp->b_off = b_off;
902 lp->f_lsn = f_lsn;
903
904 return (ret);
905 }
906
907 /*
908 * __log_flush_pp --
909 * ENV->log_flush pre/post processing.
910 *
911 * PUBLIC: int __log_flush_pp __P((DB_ENV *, const DB_LSN *));
912 */
913 int
__log_flush_pp(dbenv,lsn)914 __log_flush_pp(dbenv, lsn)
915 DB_ENV *dbenv;
916 const DB_LSN *lsn;
917 {
918 DB_THREAD_INFO *ip;
919 ENV *env;
920 int ret;
921
922 env = dbenv->env;
923
924 ENV_REQUIRES_CONFIG(env,
925 env->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG);
926
927 ENV_ENTER(env, ip);
928 REPLICATION_WRAP(env, (__log_flush(env, lsn)), 0, ret);
929 ENV_LEAVE(env, ip);
930 return (ret);
931 }
932
/*
 * See if we need to wait.  s_lsn is not locked so some care is needed.
 * The sync point can only move forward.  The lsnp->file cannot be
 * greater than the s_lsn.file.  If the file we want is in the past
 * we are done.  If the file numbers are the same check the offset.
 * This all assumes we can read a 32-bit quantity in one state or
 * the other, not in transition.
 *
 * Evaluates to non-zero when lsnp is strictly before lp->s_lsn.  Both
 * arguments are evaluated more than once.
 */
#define	ALREADY_FLUSHED(lp, lsnp)					\
	(((lp)->s_lsn.file > (lsnp)->file) ||				\
	((lp)->s_lsn.file == (lsnp)->file &&				\
	    (lp)->s_lsn.offset > (lsnp)->offset))
945
946 /*
947 * __log_flush --
948 * ENV->log_flush
949 *
950 * PUBLIC: int __log_flush __P((ENV *, const DB_LSN *));
951 */
952 int
__log_flush(env,lsn)953 __log_flush(env, lsn)
954 ENV *env;
955 const DB_LSN *lsn;
956 {
957 DB_LOG *dblp;
958 LOG *lp;
959 int ret;
960
961 dblp = env->lg_handle;
962 lp = dblp->reginfo.primary;
963 if (lsn != NULL && ALREADY_FLUSHED(lp, lsn))
964 return (0);
965 LOG_SYSTEM_LOCK(env);
966 ret = __log_flush_int(dblp, lsn, 1);
967 LOG_SYSTEM_UNLOCK(env);
968 return (ret);
969 }
970
971 /*
972 * __log_flush_int --
973 * Write all records less than or equal to the specified LSN; internal
974 * version.
975 *
976 * PUBLIC: int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
977 */
978 int
__log_flush_int(dblp,lsnp,release)979 __log_flush_int(dblp, lsnp, release)
980 DB_LOG *dblp;
981 const DB_LSN *lsnp;
982 int release;
983 {
984 struct __db_commit *commit;
985 ENV *env;
986 DB_LSN flush_lsn, f_lsn;
987 LOG *lp;
988 size_t b_off;
989 u_int32_t ncommit, w_off;
990 int do_flush, first, ret;
991
992 env = dblp->env;
993 lp = dblp->reginfo.primary;
994 ncommit = 0;
995 ret = 0;
996
997 if (lp->db_log_inmemory) {
998 lp->s_lsn = lp->lsn;
999 STAT(++lp->stat.st_scount);
1000 return (0);
1001 }
1002
1003 /*
1004 * If no LSN specified, flush the entire log by setting the flush LSN
1005 * to the last LSN written in the log. Otherwise, check that the LSN
1006 * isn't a non-existent record for the log.
1007 */
1008 if (lsnp == NULL) {
1009 flush_lsn.file = lp->lsn.file;
1010 flush_lsn.offset = lp->lsn.offset - lp->len;
1011 } else if (lsnp->file > lp->lsn.file ||
1012 (lsnp->file == lp->lsn.file &&
1013 lsnp->offset > lp->lsn.offset - lp->len)) {
1014 __db_errx(env, DB_STR_A("2516",
1015 "DB_ENV->log_flush: LSN of %lu/%lu past current end-of-log of %lu/%lu",
1016 "%lu %lu %lu %lu"), (u_long)lsnp->file,
1017 (u_long)lsnp->offset, (u_long)lp->lsn.file,
1018 (u_long)lp->lsn.offset);
1019 __db_errx(env, DB_STR("2517",
1020 "Database environment corrupt; the wrong log files may "
1021 "have been removed or incompatible database files "
1022 "imported from another environment"));
1023 return (__env_panic(env, DB_RUNRECOVERY));
1024 } else {
1025 if (ALREADY_FLUSHED(lp, lsnp))
1026 return (0);
1027 flush_lsn = *lsnp;
1028 }
1029
1030 /*
1031 * If a flush is in progress and we're allowed to do so, drop
1032 * the region lock and block waiting for the next flush.
1033 */
1034 if (release && lp->in_flush != 0) {
1035 if ((commit = SH_TAILQ_FIRST(
1036 &lp->free_commits, __db_commit)) == NULL) {
1037 if ((ret = __env_alloc(&dblp->reginfo,
1038 sizeof(struct __db_commit), &commit)) != 0)
1039 goto flush;
1040 memset(commit, 0, sizeof(*commit));
1041 if ((ret = __mutex_alloc(env, MTX_TXN_COMMIT,
1042 DB_MUTEX_SELF_BLOCK, &commit->mtx_txnwait)) != 0) {
1043 __env_alloc_free(&dblp->reginfo, commit);
1044 return (ret);
1045 }
1046 MUTEX_LOCK_NO_CTR(env, commit->mtx_txnwait);
1047 } else
1048 SH_TAILQ_REMOVE(
1049 &lp->free_commits, commit, links, __db_commit);
1050
1051 lp->ncommit++;
1052
1053 /*
1054 * Flushes may be requested out of LSN order; be
1055 * sure we only move lp->t_lsn forward.
1056 */
1057 if (LOG_COMPARE(&lp->t_lsn, &flush_lsn) < 0)
1058 lp->t_lsn = flush_lsn;
1059
1060 commit->lsn = flush_lsn;
1061 SH_TAILQ_INSERT_HEAD(
1062 &lp->commits, commit, links, __db_commit);
1063 LOG_SYSTEM_UNLOCK(env);
1064 /* Wait here for the in-progress flush to finish. */
1065 MUTEX_LOCK_NO_CTR(env, commit->mtx_txnwait);
1066 LOG_SYSTEM_LOCK(env);
1067
1068 lp->ncommit--;
1069 /*
1070 * Grab the flag before freeing the struct to see if
1071 * we need to flush the log to commit. If so,
1072 * use the maximal lsn for any committing thread.
1073 */
1074 do_flush = F_ISSET(commit, DB_COMMIT_FLUSH);
1075 F_CLR(commit, DB_COMMIT_FLUSH);
1076 SH_TAILQ_INSERT_HEAD(
1077 &lp->free_commits, commit, links, __db_commit);
1078 if (do_flush) {
1079 lp->in_flush--;
1080 flush_lsn = lp->t_lsn;
1081 } else
1082 return (0);
1083 }
1084
1085 /*
1086 * Protect flushing with its own mutex so we can release
1087 * the region lock except during file switches.
1088 */
1089 flush: MUTEX_LOCK(env, lp->mtx_flush);
1090
1091 /*
1092 * If the LSN is less than or equal to the last-sync'd LSN, we're done.
1093 * Note, the last-sync LSN saved in s_lsn is the LSN of the first byte
1094 * after the byte we absolutely know was written to disk, so the test
1095 * is <, not <=.
1096 */
1097 if (flush_lsn.file < lp->s_lsn.file ||
1098 (flush_lsn.file == lp->s_lsn.file &&
1099 flush_lsn.offset < lp->s_lsn.offset)) {
1100 MUTEX_UNLOCK(env, lp->mtx_flush);
1101 goto done;
1102 }
1103
1104 /*
1105 * We may need to write the current buffer. We have to write the
1106 * current buffer if the flush LSN is greater than or equal to the
1107 * buffer's starting LSN.
1108 *
1109 * Otherwise, it's still possible that this thread may never have
1110 * written to this log file. Acquire a file descriptor if we don't
1111 * already have one.
1112 */
1113 if (lp->b_off != 0 && LOG_COMPARE(&flush_lsn, &lp->f_lsn) >= 0) {
1114 if ((ret = __log_write(dblp,
1115 dblp->bufp, (u_int32_t)lp->b_off)) != 0) {
1116 MUTEX_UNLOCK(env, lp->mtx_flush);
1117 goto done;
1118 }
1119
1120 lp->b_off = 0;
1121 } else if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file)
1122 if ((ret = __log_newfh(dblp, 0)) != 0) {
1123 MUTEX_UNLOCK(env, lp->mtx_flush);
1124 goto done;
1125 }
1126
1127 /*
1128 * We are going to flush, release the region.
1129 * First get the current state of the buffer since
1130 * another write may come in, but we may not flush it.
1131 */
1132 b_off = lp->b_off;
1133 w_off = lp->w_off;
1134 f_lsn = lp->f_lsn;
1135 lp->in_flush++;
1136 if (release)
1137 LOG_SYSTEM_UNLOCK(env);
1138
1139 /* Sync all writes to disk. */
1140 if (!lp->nosync) {
1141 if ((ret = __os_fsync(env, dblp->lfhp)) != 0) {
1142 MUTEX_UNLOCK(env, lp->mtx_flush);
1143 if (release)
1144 LOG_SYSTEM_LOCK(env);
1145 lp->in_flush--;
1146 goto done;
1147 }
1148 STAT(++lp->stat.st_scount);
1149 }
1150
1151 /*
1152 * Set the last-synced LSN.
1153 * This value must be set to the LSN past the last complete
1154 * record that has been flushed. This is at least the first
1155 * lsn, f_lsn. If the buffer is empty, b_off == 0, then
1156 * we can move up to write point since the first lsn is not
1157 * set for the new buffer.
1158 */
1159 lp->s_lsn = f_lsn;
1160 if (b_off == 0)
1161 lp->s_lsn.offset = w_off;
1162
1163 MUTEX_UNLOCK(env, lp->mtx_flush);
1164 if (release)
1165 LOG_SYSTEM_LOCK(env);
1166
1167 lp->in_flush--;
1168
1169 /*
1170 * How many flush calls (usually commits) did this call actually sync?
1171 * At least one, if it got here.
1172 */
1173 ncommit = 1;
1174 done:
1175 if (lp->ncommit != 0) {
1176 first = 1;
1177 SH_TAILQ_FOREACH(commit, &lp->commits, links, __db_commit)
1178 if (LOG_COMPARE(&lp->s_lsn, &commit->lsn) > 0) {
1179 MUTEX_UNLOCK_NO_CTR(env, commit->mtx_txnwait);
1180 SH_TAILQ_REMOVE(
1181 &lp->commits, commit, links, __db_commit);
1182 ncommit++;
1183 } else if (first == 1) {
1184 F_SET(commit, DB_COMMIT_FLUSH);
1185 MUTEX_UNLOCK_NO_CTR(env, commit->mtx_txnwait);
1186 SH_TAILQ_REMOVE(
1187 &lp->commits, commit, links, __db_commit);
1188 /*
1189 * This thread will wake and flush.
1190 * If another thread commits and flushes
				 * first we will waste a trip through the
1192 * mutex.
1193 */
1194 lp->in_flush++;
1195 first = 0;
1196 }
1197 }
1198 #ifdef HAVE_STATISTICS
1199 if (lp->stat.st_maxcommitperflush < ncommit)
1200 lp->stat.st_maxcommitperflush = ncommit;
1201 if (lp->stat.st_mincommitperflush > ncommit ||
1202 lp->stat.st_mincommitperflush == 0)
1203 lp->stat.st_mincommitperflush = ncommit;
1204 #endif
1205
1206 return (ret);
1207 }
1208
1209 /*
1210 * __log_fill --
1211 * Write information into the log.
1212 */
1213 static int
__log_fill(dblp,lsn,addr,len)1214 __log_fill(dblp, lsn, addr, len)
1215 DB_LOG *dblp;
1216 DB_LSN *lsn;
1217 void *addr;
1218 u_int32_t len;
1219 {
1220 LOG *lp;
1221 u_int32_t bsize, nrec;
1222 size_t nw, remain;
1223 int ret;
1224
1225 lp = dblp->reginfo.primary;
1226 bsize = lp->buffer_size;
1227
1228 if (lp->db_log_inmemory) {
1229 __log_inmem_copyin(dblp, lp->b_off, addr, len);
1230 lp->b_off = (lp->b_off + len) % lp->buffer_size;
1231 return (0);
1232 }
1233
1234 while (len > 0) { /* Copy out the data. */
1235 /*
1236 * If we're beginning a new buffer, note the user LSN to which
1237 * the first byte of the buffer belongs. We have to know this
1238 * when flushing the buffer so that we know if the in-memory
1239 * buffer needs to be flushed.
1240 */
1241 if (lp->b_off == 0)
1242 lp->f_lsn = *lsn;
1243
1244 /*
1245 * If we're on a buffer boundary and the data is big enough,
1246 * copy as many records as we can directly from the data.
1247 */
1248 if (lp->b_off == 0 && len >= bsize) {
1249 nrec = len / bsize;
1250 if ((ret = __log_write(dblp, addr, nrec * bsize)) != 0)
1251 return (ret);
1252 addr = (u_int8_t *)addr + nrec * bsize;
1253 len -= nrec * bsize;
1254 STAT(++lp->stat.st_wcount_fill);
1255 continue;
1256 }
1257
1258 /* Figure out how many bytes we can copy this time. */
1259 remain = bsize - lp->b_off;
1260 nw = remain > len ? len : remain;
1261 memcpy(dblp->bufp + lp->b_off, addr, nw);
1262 addr = (u_int8_t *)addr + nw;
1263 len -= (u_int32_t)nw;
1264 lp->b_off += (u_int32_t)nw;
1265
1266 /* If we fill the buffer, flush it. */
1267 if (lp->b_off == bsize) {
1268 if ((ret = __log_write(dblp, dblp->bufp, bsize)) != 0)
1269 return (ret);
1270 lp->b_off = 0;
1271 STAT(++lp->stat.st_wcount_fill);
1272 }
1273 }
1274 return (0);
1275 }
1276
1277 /*
1278 * __log_write --
1279 * Write the log buffer to disk.
1280 */
1281 static int
__log_write(dblp,addr,len)1282 __log_write(dblp, addr, len)
1283 DB_LOG *dblp;
1284 void *addr;
1285 u_int32_t len;
1286 {
1287 ENV *env;
1288 LOG *lp;
1289 size_t nw;
1290 int ret;
1291
1292 env = dblp->env;
1293 lp = dblp->reginfo.primary;
1294
1295 DB_ASSERT(env, !lp->db_log_inmemory);
1296
1297 /*
1298 * If we haven't opened the log file yet or the current one has
1299 * changed, acquire a new log file. We are creating the file if we're
1300 * about to write to the start of it, in other words, if the write
1301 * offset is zero.
1302 */
1303 if (dblp->lfhp == NULL || dblp->lfname != lp->lsn.file ||
1304 dblp->lf_timestamp != lp->timestamp)
1305 if ((ret = __log_newfh(dblp, lp->w_off == 0)) != 0)
1306 return (ret);
1307
1308 /*
1309 * If we're writing the first block in a log file on a filesystem that
1310 * guarantees unwritten blocks are zero-filled, we set the size of the
1311 * file in advance. This increases sync performance on some systems,
1312 * because they don't need to update metadata on every sync.
1313 *
1314 * Ignore any error -- we may have run out of disk space, but that's no
1315 * reason to quit.
1316 */
1317 #ifdef HAVE_FILESYSTEM_NOTZERO
1318 if (lp->w_off == 0 && !__os_fs_notzero()) {
1319 #else
1320 if (lp->w_off == 0) {
1321 #endif
1322 (void)__db_file_extend(env, dblp->lfhp, lp->log_size);
1323 if (F_ISSET(dblp, DBLOG_ZERO))
1324 (void)__db_zero_extend(env, dblp->lfhp,
1325 0, lp->log_size/lp->buffer_size, lp->buffer_size);
1326
1327 }
1328
1329 /*
1330 * Seek to the offset in the file (someone may have written it
1331 * since we last did).
1332 */
1333 if ((ret = __os_io(env, DB_IO_WRITE,
1334 dblp->lfhp, 0, 0, lp->w_off, len, addr, &nw)) != 0)
1335 return (ret);
1336
1337 /* Reset the buffer offset and update the seek offset. */
1338 lp->w_off += len;
1339
1340 /* Update written statistics. */
1341 if ((lp->stat.st_wc_bytes += len) >= MEGABYTE) {
1342 lp->stat.st_wc_bytes -= MEGABYTE;
1343 ++lp->stat.st_wc_mbytes;
1344 }
1345 #ifdef HAVE_STATISTICS
1346 if ((lp->stat.st_w_bytes += len) >= MEGABYTE) {
1347 lp->stat.st_w_bytes -= MEGABYTE;
1348 ++lp->stat.st_w_mbytes;
1349 }
1350 ++lp->stat.st_wcount;
1351 #endif
1352
1353 return (0);
1354 }
1355
1356 /*
1357 * __log_file_pp --
1358 * ENV->log_file pre/post processing.
1359 *
1360 * PUBLIC: int __log_file_pp __P((DB_ENV *, const DB_LSN *, char *, size_t));
1361 */
1362 int
__log_file_pp(dbenv,lsn,namep,len)1363 __log_file_pp(dbenv, lsn, namep, len)
1364 DB_ENV *dbenv;
1365 const DB_LSN *lsn;
1366 char *namep;
1367 size_t len;
1368 {
1369 DB_THREAD_INFO *ip;
1370 ENV *env;
1371 int ret, set;
1372
1373 env = dbenv->env;
1374
1375 ENV_REQUIRES_CONFIG(env,
1376 env->lg_handle, "DB_ENV->log_file", DB_INIT_LOG);
1377
1378 if ((ret = __log_get_config(dbenv, DB_LOG_IN_MEMORY, &set)) != 0)
1379 return (ret);
1380 if (set) {
1381 __db_errx(env, DB_STR("2518",
1382 "DB_ENV->log_file is illegal with in-memory logs"));
1383 return (EINVAL);
1384 }
1385
1386 ENV_ENTER(env, ip);
1387 REPLICATION_WRAP(env, (__log_file(env, lsn, namep, len)), 0, ret);
1388 ENV_LEAVE(env, ip);
1389 return (ret);
1390 }
1391
1392 /*
1393 * __log_file --
1394 * ENV->log_file.
1395 */
1396 static int
__log_file(env,lsn,namep,len)1397 __log_file(env, lsn, namep, len)
1398 ENV *env;
1399 const DB_LSN *lsn;
1400 char *namep;
1401 size_t len;
1402 {
1403 DB_LOG *dblp;
1404 int ret;
1405 char *name;
1406
1407 dblp = env->lg_handle;
1408 LOG_SYSTEM_LOCK(env);
1409 ret = __log_name(dblp, lsn->file, &name, NULL, 0);
1410 LOG_SYSTEM_UNLOCK(env);
1411 if (ret != 0)
1412 return (ret);
1413
1414 /* Check to make sure there's enough room and copy the name. */
1415 if (len < strlen(name) + 1) {
1416 *namep = '\0';
1417 __db_errx(env, DB_STR("2519",
1418 "DB_ENV->log_file: name buffer is too short"));
1419 return (EINVAL);
1420 }
1421 (void)strcpy(namep, name);
1422 __os_free(env, name);
1423
1424 return (0);
1425 }
1426
1427 /*
1428 * __log_newfh --
1429 * Acquire a file handle for the current log file.
1430 */
1431 static int
__log_newfh(dblp,create)1432 __log_newfh(dblp, create)
1433 DB_LOG *dblp;
1434 int create;
1435 {
1436 ENV *env;
1437 LOG *lp;
1438 u_int32_t flags;
1439 int ret;
1440 logfile_validity status;
1441
1442 env = dblp->env;
1443 lp = dblp->reginfo.primary;
1444
1445 /* Close any previous file descriptor. */
1446 if (dblp->lfhp != NULL) {
1447 (void)__os_closehandle(env, dblp->lfhp);
1448 dblp->lfhp = NULL;
1449 }
1450
1451 flags = DB_OSO_SEQ |
1452 (create ? DB_OSO_CREATE : 0) |
1453 (F_ISSET(dblp, DBLOG_DIRECT) ? DB_OSO_DIRECT : 0) |
1454 (F_ISSET(dblp, DBLOG_DSYNC) ? DB_OSO_DSYNC : 0);
1455
1456 /* Get the path of the new file and open it. */
1457 dblp->lfname = lp->lsn.file;
1458 if ((ret = __log_valid(dblp, dblp->lfname, 0, &dblp->lfhp,
1459 flags, &status, NULL)) != 0)
1460 __db_err(env, ret,
1461 "DB_ENV->log_newfh: %lu", (u_long)lp->lsn.file);
1462 else if (status != DB_LV_NORMAL && status != DB_LV_INCOMPLETE &&
1463 status != DB_LV_OLD_READABLE)
1464 ret = USR_ERR(env, DB_NOTFOUND);
1465
1466 return (ret);
1467 }
1468
1469 /*
1470 * __log_name --
1471 * Return the log name for a particular file, and optionally open it.
1472 *
1473 * PUBLIC: int __log_name __P((DB_LOG *,
1474 * PUBLIC: u_int32_t, char **, DB_FH **, u_int32_t));
1475 */
1476 int
__log_name(dblp,filenumber,namep,fhpp,flags)1477 __log_name(dblp, filenumber, namep, fhpp, flags)
1478 DB_LOG *dblp;
1479 u_int32_t filenumber, flags;
1480 char **namep;
1481 DB_FH **fhpp;
1482 {
1483 ENV *env;
1484 LOG *lp;
1485 int mode, ret;
1486 char *oname;
1487 char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
1488
1489 env = dblp->env;
1490 lp = dblp->reginfo.primary;
1491
1492 DB_ASSERT(env, !lp->db_log_inmemory);
1493
1494 /*
1495 * !!!
1496 * The semantics of this routine are bizarre.
1497 *
1498 * The reason for all of this is that we need a place where we can
1499 * intercept requests for log files, and, if appropriate, check for
1500 * both the old-style and new-style log file names. The trick is
1501 * that all callers of this routine that are opening the log file
1502 * read-only want to use an old-style file name if they can't find
1503 * a match using a new-style name. The only down-side is that some
1504 * callers may check for the old-style when they really don't need
1505 * to, but that shouldn't mess up anything, and we only check for
1506 * the old-style name when we've already failed to find a new-style
1507 * one.
1508 *
1509 * Create a new-style file name, and if we're not going to open the
1510 * file, return regardless.
1511 */
1512 (void)snprintf(new, sizeof(new), LFNAME, filenumber);
1513 if ((ret = __db_appname(env,
1514 DB_APP_LOG, new, NULL, namep)) != 0 || fhpp == NULL)
1515 return (ret);
1516
1517 /* The application may have specified an absolute file mode. */
1518 if (lp->filemode == 0)
1519 mode = env->db_mode;
1520 else {
1521 LF_SET(DB_OSO_ABSMODE);
1522 mode = lp->filemode;
1523 }
1524
1525 /* Open the new-style file -- if we succeed, we're done. */
1526 dblp->lf_timestamp = lp->timestamp;
1527 if ((ret = __os_open(env, *namep, 0, flags, mode, fhpp)) == 0)
1528 return (0);
1529
1530 /*
1531 * If the open failed for reason other than the file
1532 * not being there, complain loudly, the wrong user
1533 * probably started up the application.
1534 */
1535 if (ret != ENOENT) {
1536 __db_err(env, ret, DB_STR_A("2520",
1537 "%s: log file unreadable", "%s"), *namep);
1538 return (__env_panic(env, ret));
1539 }
1540
1541 /*
1542 * The open failed... if the DB_RDONLY flag isn't set, we're done,
1543 * the caller isn't interested in old-style files.
1544 */
1545 if (!LF_ISSET(DB_OSO_RDONLY)) {
1546 __db_err(env, ret, DB_STR_A("2521",
1547 "%s: log file open failed", "%s"), *namep);
1548 return (__env_panic(env, ret));
1549 }
1550
1551 /* Create an old-style file name. */
1552 (void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
1553 if ((ret = __db_appname(env,
1554 DB_APP_LOG, old, NULL, &oname)) != 0)
1555 goto err;
1556
1557 /*
1558 * Open the old-style file -- if we succeed, we're done. Free the
1559 * space allocated for the new-style name and return the old-style
1560 * name to the caller.
1561 */
1562 if ((ret = __os_open(env, oname, 0, flags, mode, fhpp)) == 0) {
1563 __os_free(env, *namep);
1564 *namep = oname;
1565 return (0);
1566 }
1567
1568 /*
1569 * Couldn't find either style of name -- return the new-style name
1570 * for the caller's error message. If it's an old-style name that's
1571 * actually missing we're going to confuse the user with the error
1572 * message, but that implies that not only were we looking for an
1573 * old-style name, but we expected it to exist and we weren't just
1574 * looking for any log file. That's not a likely error.
1575 */
1576 err: __os_free(env, oname);
1577 return (ret);
1578 }
1579
1580 /*
1581 * __log_rep_put --
1582 * Short-circuit way for replication clients to put records into the
1583 * log. Replication clients' logs need to be laid out exactly as their masters'
1584 * are, so we let replication take responsibility for when the log gets
1585 * flushed, when log switches files, etc. This is just a thin PUBLIC wrapper
1586 * for __log_putr with a slightly prettier interface.
1587 *
1588 * Note that the REP->mtx_clientdb should be held when this is called.
1589 * Note that we acquire the log region mutex while holding mtx_clientdb.
1590 *
1591 * PUBLIC: int __log_rep_put __P((ENV *, DB_LSN *, const DBT *, u_int32_t));
1592 */
1593 int
__log_rep_put(env,lsnp,rec,flags)1594 __log_rep_put(env, lsnp, rec, flags)
1595 ENV *env;
1596 DB_LSN *lsnp;
1597 const DBT *rec;
1598 u_int32_t flags;
1599 {
1600 DBT *dbt, t;
1601 DB_CIPHER *db_cipher;
1602 DB_LOG *dblp;
1603 HDR hdr;
1604 LOG *lp;
1605 int need_free, ret;
1606
1607 dblp = env->lg_handle;
1608 lp = dblp->reginfo.primary;
1609
1610 LOG_SYSTEM_LOCK(env);
1611 memset(&hdr, 0, sizeof(HDR));
1612 t = *rec;
1613 dbt = &t;
1614 need_free = 0;
1615 db_cipher = env->crypto_handle;
1616 if (CRYPTO_ON(env))
1617 t.size += db_cipher->adj_size(rec->size);
1618 if ((ret = __os_calloc(env, 1, t.size, &t.data)) != 0)
1619 goto err;
1620 need_free = 1;
1621 memcpy(t.data, rec->data, rec->size);
1622
1623 if ((ret = __log_encrypt_record(env, dbt, &hdr, rec->size)) != 0)
1624 goto err;
1625
1626 DB_ASSERT(env, LOG_COMPARE(lsnp, &lp->lsn) == 0);
1627 ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr);
1628 err:
1629 /*
1630 * !!! Assume caller holds REP->mtx_clientdb to modify ready_lsn.
1631 */
1632 lp->ready_lsn = lp->lsn;
1633
1634 if (LF_ISSET(DB_LOG_CHKPNT))
1635 lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
1636
1637 /* Increment count of records added to the log. */
1638 STAT(++lp->stat.st_record);
1639 LOG_SYSTEM_UNLOCK(env);
1640 if (need_free)
1641 __os_free(env, t.data);
1642 return (ret);
1643 }
1644
1645 /*
1646 * __log_rep_write --
1647 * Way for replication clients to write the log buffer for the
1648 * DB_TXN_WRITE_NOSYNC option. This is just a thin PUBLIC wrapper
1649 * for __log_write that is similar to __log_flush_commit.
1650 *
1651 * Note that the REP->mtx_clientdb should be held when this is called.
1652 * Note that we acquire the log region mutex while holding mtx_clientdb.
1653 *
1654 * PUBLIC: int __log_rep_write __P((ENV *));
1655 */
1656 int
__log_rep_write(env)1657 __log_rep_write(env)
1658 ENV *env;
1659 {
1660 DB_LOG *dblp;
1661 LOG *lp;
1662 int ret;
1663
1664 dblp = env->lg_handle;
1665 lp = dblp->reginfo.primary;
1666 ret = 0;
1667 LOG_SYSTEM_LOCK(env);
1668 if (!lp->db_log_inmemory && lp->b_off != 0)
1669 if ((ret = __log_write(dblp, dblp->bufp,
1670 (u_int32_t)lp->b_off)) == 0)
1671 lp->b_off = 0;
1672 LOG_SYSTEM_UNLOCK(env);
1673 return (ret);
1674 }
1675
1676 static int
__log_encrypt_record(env,dbt,hdr,orig)1677 __log_encrypt_record(env, dbt, hdr, orig)
1678 ENV *env;
1679 DBT *dbt;
1680 HDR *hdr;
1681 u_int32_t orig;
1682 {
1683 DB_CIPHER *db_cipher;
1684 int ret;
1685
1686 if (CRYPTO_ON(env)) {
1687 db_cipher = env->crypto_handle;
1688 hdr->size = HDR_CRYPTO_SZ;
1689 hdr->orig_size = orig;
1690 if ((ret = db_cipher->encrypt(env, db_cipher->data,
1691 hdr->iv, dbt->data, dbt->size)) != 0)
1692 return (ret);
1693 } else {
1694 hdr->size = HDR_NORMAL_SZ;
1695 }
1696 return (0);
1697 }
1698 /*
1699 * __log_put_record_pp --
1700 * DB_ENV->log_put_record pre/post processing.
1701 *
1702 * PUBLIC: int __log_put_record_pp __P((DB_ENV *, DB *, DB_TXN *, DB_LSN *,
1703 * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int32_t,
1704 * PUBLIC: DB_LOG_RECSPEC *, ...));
1705 */
1706 int
__log_put_record_pp(DB_ENV * dbenv,DB * dbp,DB_TXN * txnp,DB_LSN * ret_lsnp,u_int32_t flags,u_int32_t rectype,u_int32_t has_data,u_int32_t size,DB_LOG_RECSPEC * spec,...)1707 __log_put_record_pp(DB_ENV *dbenv, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
1708 u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
1709 DB_LOG_RECSPEC *spec, ...)
1710 {
1711 DB_THREAD_INFO *ip;
1712 ENV *env;
1713 va_list argp;
1714 int ret;
1715
1716 env = dbenv->env;
1717
1718 ENV_REQUIRES_CONFIG(env,
1719 env->lg_handle, "DB_ENV->log_put_record", DB_INIT_LOG);
1720
1721 /* Validate arguments: check for allowed flags. */
1722 if ((ret = __db_fchk(env, "DB_ENV->log_put_record", flags,
1723 DB_LOG_CHKPNT | DB_LOG_COMMIT |
1724 DB_FLUSH | DB_LOG_NOCOPY | DB_LOG_WRNOSYNC)) != 0)
1725 return (ret);
1726
1727 /* DB_LOG_WRNOSYNC and DB_FLUSH are mutually exclusive. */
1728 if (LF_ISSET(DB_LOG_WRNOSYNC) && LF_ISSET(DB_FLUSH))
1729 return (__db_ferr(env, "DB_ENV->log_put_record", 1));
1730
1731 /* Replication clients should never write log records. */
1732 if (IS_REP_CLIENT(env)) {
1733 __db_errx(env, DB_STR("2511",
1734 "DB_ENV->log_put is illegal on replication clients"));
1735 return (EINVAL);
1736 }
1737
1738 ENV_ENTER(env, ip);
1739 va_start(argp, spec);
1740 REPLICATION_WRAP(env, (__log_put_record_int(env, dbp,
1741 txnp, ret_lsnp, flags, rectype, has_data, size, spec, argp)),
1742 0, ret);
1743 va_end(argp);
1744 ENV_LEAVE(env, ip);
1745 return (ret);
1746 }
1747
1748 /*
1749 * PUBLIC: int __log_put_record __P((ENV *, DB *, DB_TXN *, DB_LSN *,
1750 * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int32_t,
1751 * PUBLIC: DB_LOG_RECSPEC *, ...));
1752 */
1753 int
__log_put_record(ENV * env,DB * dbp,DB_TXN * txnp,DB_LSN * ret_lsnp,u_int32_t flags,u_int32_t rectype,u_int32_t has_data,u_int32_t size,DB_LOG_RECSPEC * spec,...)1754 __log_put_record(ENV *env, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
1755 u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
1756 DB_LOG_RECSPEC *spec, ...)
1757 {
1758 va_list argp;
1759 int ret;
1760
1761 va_start(argp, spec);
1762 ret = __log_put_record_int(env, dbp, txnp, ret_lsnp, flags,
1763 rectype, has_data, size, spec, argp);
1764 va_end(argp);
1765 return (ret);
1766 }
1767
1768 static int
__log_put_record_int(ENV * env,DB * dbp,DB_TXN * txnp,DB_LSN * ret_lsnp,u_int32_t flags,u_int32_t rectype,u_int32_t has_data,u_int32_t size,DB_LOG_RECSPEC * spec,va_list argp)1769 __log_put_record_int(ENV *env, DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp,
1770 u_int32_t flags, u_int32_t rectype, u_int32_t has_data, u_int32_t size,
1771 DB_LOG_RECSPEC *spec, va_list argp)
1772 {
1773 DBT *data, *dbt, *header, logrec;
1774 DB_LOG_RECSPEC *sp;
1775 DB_LSN *lsnp, lsn, null_lsn, *pagelsn, *rlsnp;
1776 DB_TXNLOGREC *lr;
1777 LOG *lp;
1778 PAGE *pghdrstart;
1779 u_int64_t ulltmp;
1780 u_int32_t hdrsize, op, zero, uinttmp, txn_num;
1781 u_int npad;
1782 u_int8_t *bp;
1783 int is_durable, ret;
1784 void *hdrstart;
1785
1786 COMPQUIET(lr, NULL);
1787 COMPQUIET(hdrsize, 0);
1788 COMPQUIET(op, 0);
1789 COMPQUIET(hdrstart, NULL);
1790 COMPQUIET(pghdrstart, NULL);
1791 COMPQUIET(header, NULL);
1792
1793 /*
1794 * rlsnp will be stored into while holding the log system lock.
1795 * If this is a commit record then ret_lsnp will be the address of
1796 * the transaction detail visible_lsn field. If not then this
1797 * may be the lsn of a page and we do not want to set it if
1798 * the log_put fails after writing the record (due to an I/O error).
1799 */
1800 if (LF_ISSET(DB_LOG_COMMIT))
1801 rlsnp = ret_lsnp;
1802 else
1803 rlsnp = &lsn;
1804 npad = 0;
1805 ret = 0;
1806 data = NULL;
1807
1808 if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
1809 (dbp != NULL && F_ISSET(dbp, DB_AM_NOT_DURABLE))) {
1810 if (txnp == NULL)
1811 return (0);
1812 is_durable = 0;
1813 } else
1814 is_durable = 1;
1815
1816 if (txnp == NULL) {
1817 txn_num = 0;
1818 lsnp = &null_lsn;
1819 null_lsn.file = null_lsn.offset = 0;
1820 } else {
1821 if (TAILQ_FIRST(&txnp->kids) != NULL &&
1822 (ret = __txn_activekids(env, rectype, txnp)) != 0)
1823 return (ret);
1824 /*
1825 * We need to assign begin_lsn while holding region mutex.
1826 * That assignment is done inside the __log_put call,
1827 * so pass in the appropriate memory location to be filled
1828 * in by the log_put code.
1829 */
1830 DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
1831 txn_num = txnp->txnid;
1832 }
1833
1834 if (dbp != NULL) {
1835 DB_ASSERT(env, dbp->log_filename != NULL);
1836 if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
1837 (ret = __dbreg_lazy_id(dbp)) != 0)
1838 return (ret);
1839 }
1840
1841 logrec.size = size;
1842
1843 if (CRYPTO_ON(env)) {
1844 npad = env->crypto_handle->adj_size(logrec.size);
1845 logrec.size += npad;
1846 }
1847
1848 if (is_durable || txnp == NULL) {
1849 if ((ret = __os_malloc(env, logrec.size, &logrec.data)) != 0)
1850 return (ret);
1851 } else {
1852 if ((ret = __os_malloc(env,
1853 logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
1854 return (ret);
1855 #ifdef DIAGNOSTIC
1856 if ((ret =
1857 __os_malloc(env, logrec.size, &logrec.data)) != 0) {
1858 __os_free(env, lr);
1859 return (ret);
1860 }
1861 #else
1862 logrec.data = lr->data;
1863 #endif
1864 }
1865 if (npad > 0)
1866 memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
1867
1868 bp = logrec.data;
1869
1870 LOGCOPY_32(env, bp, &rectype);
1871 bp += sizeof(rectype);
1872
1873 LOGCOPY_32(env, bp, &txn_num);
1874 bp += sizeof(txn_num);
1875
1876 LOGCOPY_FROMLSN(env, bp, lsnp);
1877 bp += sizeof(DB_LSN);
1878
1879 zero = 0;
1880 lp = env->lg_handle->reginfo.primary;
1881 for (sp = spec; sp->type != LOGREC_Done; sp++) {
1882 switch (sp->type) {
1883 case LOGREC_DB:
1884 /* This is not in the varargs. */
1885 uinttmp = (u_int32_t)dbp->log_filename->id;
1886 LOGCOPY_32(env, bp, &uinttmp);
1887 bp += sizeof(uinttmp);
1888 break;
1889
1890 case LOGREC_ARG:
1891 case LOGREC_TIME:
1892 case LOGREC_DBOP:
1893 uinttmp = va_arg(argp, u_int32_t);
1894 LOGCOPY_32(env, bp, &uinttmp);
1895 bp += sizeof(uinttmp);
1896 break;
1897 case LOGREC_LONGARG:
1898 ulltmp = va_arg(argp, u_int64_t);
1899 LOGCOPY_64(env, bp, &ulltmp);
1900 bp += sizeof(ulltmp);
1901 break;
1902 case LOGREC_OP:
1903 op = va_arg(argp, u_int32_t);
1904 LOGCOPY_32(env, bp, &op);
1905 bp += sizeof(op);
1906 break;
1907 case LOGREC_DBT:
1908 case LOGREC_PGLIST:
1909 case LOGREC_LOCKS:
1910 case LOGREC_HDR:
1911 case LOGREC_DATA:
1912 dbt = va_arg(argp, DBT *);
1913 if (dbt == NULL) {
1914 LOGCOPY_32(env, bp, &zero);
1915 bp += sizeof(u_int32_t);
1916 } else {
1917 LOGCOPY_32(env, bp, &dbt->size);
1918 bp += sizeof(dbt->size);
1919 memcpy(bp, dbt->data, dbt->size);
1920 }
1921 /* Process fields that need to be byte swapped. */
1922 if (dbp != NULL && F_ISSET(dbp, DB_AM_SWAP)) {
1923 if (sp->type == LOGREC_HDR &&
1924 dbt != NULL && has_data == 0)
1925 __db_recordswap(op,
1926 dbt->size, bp, NULL, 0);
1927 else if (sp->type == LOGREC_HDR) {
1928 hdrstart = bp;
1929 hdrsize = dbt == NULL ? 0 : dbt->size;
1930 } else if (sp->type == LOGREC_DATA) {
1931 __db_recordswap(op,
1932 hdrsize, hdrstart, bp, 0);
1933 has_data = 0;
1934 }
1935 }
1936 if (dbt != NULL)
1937 bp += dbt->size;
1938
1939 break;
1940 /*
1941 * Page header and data -- we assume that the header
1942 * is listed first and the data follows sometime later.
1943 * There should be only one header/data pair per record.
1944 */
1945 case LOGREC_PGDBT:
1946 header = va_arg(argp, DBT *);
1947 if (header == NULL) {
1948 LOGCOPY_32(env, bp, &zero);
1949 bp += sizeof(u_int32_t);
1950 } else {
1951 LOGCOPY_32(env, bp, &header->size);
1952 bp += sizeof(header->size);
1953 pghdrstart = (PAGE *)bp;
1954 memcpy(bp, header->data, header->size);
1955 if (has_data == 0 &&
1956 F_ISSET(dbp, DB_AM_SWAP) &&
1957 (ret = __db_pageswap(
1958 env, dbp, pghdrstart, (size_t)header->size,
1959 NULL, 0)) != 0)
1960 return (ret);
1961 bp += header->size;
1962 }
1963 break;
1964
1965 case LOGREC_PGDDBT:
1966 data = va_arg(argp, DBT *);
1967 if (data == NULL) {
1968 zero = 0;
1969 LOGCOPY_32(env, bp, &zero);
1970 bp += sizeof(u_int32_t);
1971 } else {
1972 if (F_ISSET(dbp, DB_AM_SWAP) &&
1973 (ret = __db_pageswap(env, dbp, pghdrstart,
1974 (size_t)header->size, (DBT *)data, 0)) != 0)
1975 return (ret);
1976 LOGCOPY_32(env, bp, &data->size);
1977 bp += sizeof(data->size);
1978 memcpy(bp, data->data, data->size);
1979 if (F_ISSET(dbp, DB_AM_SWAP) &&
1980 F_ISSET(data, DB_DBT_APPMALLOC))
1981 __os_free(env, data->data);
1982 bp += data->size;
1983 }
1984 break;
1985 case LOGREC_POINTER:
1986 pagelsn = va_arg(argp, DB_LSN *);
1987 if (pagelsn != NULL) {
1988 if (txnp != NULL) {
1989 if (LOG_COMPARE(pagelsn,
1990 &lp->lsn) >= 0 && (ret =
1991 __log_check_page_lsn(env,
1992 dbp, pagelsn)) != 0)
1993 return (ret);
1994 }
1995 LOGCOPY_FROMLSN(env, bp, pagelsn);
1996 } else
1997 memset(bp, 0, sizeof(*pagelsn));
1998 bp += sizeof(*pagelsn);
1999 break;
2000
2001 default:
2002 DB_ASSERT(env, sp->type != sp->type);
2003 }
2004 }
2005
2006 DB_ASSERT(env,
2007 (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
2008
2009 if (is_durable || txnp == NULL) {
2010 if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
2011 flags | DB_LOG_NOCOPY)) == 0) {
2012 if (txnp != NULL)
2013 *lsnp = *rlsnp;
2014 *ret_lsnp = *rlsnp;
2015 }
2016 } else {
2017 ret = 0;
2018 #ifdef DIAGNOSTIC
2019 /*
2020 * Set the debug bit if we are going to log non-durable
2021 * transactions so they will be ignored by recovery.
2022 */
2023 memcpy(lr->data, logrec.data, logrec.size);
2024 rectype |= DB_debug_FLAG;
2025 LOGCOPY_32(env, logrec.data, &rectype);
2026
2027 if (!IS_REP_CLIENT(env) && !lp->db_log_inmemory)
2028 ret = __log_put(env,
2029 rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
2030 #endif
2031 STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
2032 F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
2033 LSN_NOT_LOGGED(*ret_lsnp);
2034 }
2035
2036 #ifdef LOG_DIAGNOSTIC
2037 if (ret != 0)
2038 (void)__db_addrem_print(env,
2039 (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
2040 #endif
2041
2042 #ifdef DIAGNOSTIC
2043 __os_free(env, logrec.data);
2044 #else
2045 if (is_durable || txnp == NULL)
2046 __os_free(env, logrec.data);
2047 #endif
2048 return (ret);
2049 }
2050