1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * SPDX-License-Identifier: MPL-2.0
5  *
6  * This Source Code Form is subject to the terms of the Mozilla Public
7  * License, v. 2.0. If a copy of the MPL was not distributed with this
8  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
9  *
10  * See the COPYRIGHT file distributed with this work for additional
11  * information regarding copyright ownership.
12  */
13 
14 #include <errno.h>
15 #include <inttypes.h>
16 #include <stdbool.h>
17 #include <stdlib.h>
18 #include <unistd.h>
19 
20 #include <isc/file.h>
21 #include <isc/mem.h>
22 #include <isc/print.h>
23 #include <isc/serial.h>
24 #include <isc/stdio.h>
25 #include <isc/string.h>
26 #include <isc/util.h>
27 
28 #include <dns/compress.h>
29 #include <dns/db.h>
30 #include <dns/dbiterator.h>
31 #include <dns/diff.h>
32 #include <dns/fixedname.h>
33 #include <dns/journal.h>
34 #include <dns/log.h>
35 #include <dns/rdataset.h>
36 #include <dns/rdatasetiter.h>
37 #include <dns/result.h>
38 #include <dns/soa.h>
39 
40 /*! \file
41  * \brief Journaling.
42  *
43  * A journal file consists of
44  *
45  *   \li A fixed-size header of type journal_rawheader_t.
46  *
47  *   \li The index.  This is an unordered array of index entries
48  *     of type journal_rawpos_t giving the locations
49  *     of some arbitrary subset of the journal's addressable
50  *     transactions.  The index entries are used as hints to
51  *     speed up the process of locating a transaction with a given
52  *     serial number.  Unused index entries have an "offset"
53  *     field of zero.  The size of the index can vary between
54  *     journal files, but does not change during the lifetime
55  *     of a file.  The size can be zero.
56  *
57  *   \li The journal data.  This  consists of one or more transactions.
58  *     Each transaction begins with a transaction header of type
59  *     journal_rawxhdr_t.  The transaction header is followed by a
60  *     sequence of RRs, similar in structure to an IXFR difference
61  *     sequence (RFC1995).  That is, the pre-transaction SOA,
62  *     zero or more other deleted RRs, the post-transaction SOA,
63  *     and zero or more other added RRs.  Unlike in IXFR, each RR
64  *     is prefixed with a 32-bit length.
65  *
66  *     The journal data part grows as new transactions are
67  *     appended to the file.  Only those transactions
68  *     whose serial number is current-(2^31-1) to current
69  *     are considered "addressable" and may be pointed
70  *     to from the header or index.  They may be preceded
71  *     by old transactions that are no longer addressable,
72  *     and they may be followed by transactions that were
73  *     appended to the journal but never committed by updating
74  *     the "end" position in the header.  The latter will
75  *     be overwritten when new transactions are added.
76  */
77 
78 /**************************************************************************/
79 /*
80  * Miscellaneous utilities.
81  */
82 
/*% Leading isc_log_write() arguments shared by the journal's log calls. */
#define JOURNAL_COMMON_LOGARGS \
	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/*% JOURNAL_COMMON_LOGARGS followed by a debug log level of 'n'. */
#define JOURNAL_DEBUG_LOGARGS(n) JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)

/*%
 * Store 'code' in the local variable 'result' and jump to the local
 * 'failure' label unless it is ISC_R_SUCCESS.
 *
 * It would be nonsensical (or at least obtuse) to use FAIL() with an
 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
 * from complaining about "end-of-loop code not reached".
 */
#define FAIL(code)                           \
	do {                                 \
		result = (code);             \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
	} while (0)

/*%
 * Evaluate 'op', store its value in the local variable 'result', and
 * jump to the local 'failure' label unless it is ISC_R_SUCCESS.
 */
#define CHECK(op)                            \
	do {                                 \
		result = (op);               \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
	} while (0)

/*%
 * Bit in the 'flags' byte of the on-disk journal header; it is the
 * on-disk form of the in-core 'serialset' flag.
 */
#define JOURNAL_SERIALSET 0x01U

/* Forward declaration. */
static isc_result_t
index_to_disk(dns_journal_t *);
111 
/*%
 * Decode the 32-bit big-endian integer stored in the four bytes
 * starting at 'p' and return it in host byte order.
 *
 * 'p' is only read, so it is const-qualified; callers may pass either
 * const or non-const buffers.
 */
static inline uint32_t
decode_uint32(const unsigned char *p) {
	return (((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
		((uint32_t)p[2] << 8) | (uint32_t)p[3]);
}
117 
/*%
 * Store 'val' as a 32-bit big-endian integer in the four bytes
 * starting at 'p' (most significant byte first).
 */
static inline void
encode_uint32(uint32_t val, unsigned char *p) {
	unsigned int shift = 32;
	int i;

	for (i = 0; i < 4; i++) {
		shift -= 8;
		p[i] = (uint8_t)(val >> shift);
	}
}
125 
126 isc_result_t
dns_db_createsoatuple(dns_db_t * db,dns_dbversion_t * ver,isc_mem_t * mctx,dns_diffop_t op,dns_difftuple_t ** tp)127 dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
128 		      dns_diffop_t op, dns_difftuple_t **tp) {
129 	isc_result_t result;
130 	dns_dbnode_t *node;
131 	dns_rdataset_t rdataset;
132 	dns_rdata_t rdata = DNS_RDATA_INIT;
133 	dns_fixedname_t fixed;
134 	dns_name_t *zonename;
135 
136 	zonename = dns_fixedname_initname(&fixed);
137 	dns_name_copynf(dns_db_origin(db), zonename);
138 
139 	node = NULL;
140 	result = dns_db_findnode(db, zonename, false, &node);
141 	if (result != ISC_R_SUCCESS) {
142 		goto nonode;
143 	}
144 
145 	dns_rdataset_init(&rdataset);
146 	result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
147 				     (isc_stdtime_t)0, &rdataset, NULL);
148 	if (result != ISC_R_SUCCESS) {
149 		goto freenode;
150 	}
151 
152 	result = dns_rdataset_first(&rdataset);
153 	if (result != ISC_R_SUCCESS) {
154 		goto freenode;
155 	}
156 
157 	dns_rdataset_current(&rdataset, &rdata);
158 	dns_rdataset_getownercase(&rdataset, zonename);
159 
160 	result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl, &rdata,
161 				      tp);
162 
163 	dns_rdataset_disassociate(&rdataset);
164 	dns_db_detachnode(db, &node);
165 	return (result);
166 
167 freenode:
168 	dns_db_detachnode(db, &node);
169 nonode:
170 	UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
171 	return (result);
172 }
173 
174 /* Journaling */
175 
/*%
 * On-disk representation of a "pointer" to a journal entry.
 * These are used in the journal header to locate the beginning
 * and end of the journal, and in the journal index to locate
 * other transactions.
 *
 * Both fields are encoded and decoded as 32-bit big-endian values
 * (see journal_pos_encode() and journal_pos_decode()).
 */
typedef struct {
	unsigned char serial[4]; /*%< SOA serial before update. */
	/*
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char offset[4]; /*%< Offset from beginning of file. */
} journal_rawpos_t;
193 
/*%
 * The header is of a fixed size, with some spare room for future
 * extensions.
 */
#define JOURNAL_HEADER_SIZE 64 /* Bytes. */

/*%
 * Transaction header layouts.  Version 1 headers are read as
 * journal_rawxhdr_ver1_t and version 2 headers as journal_rawxhdr_t.
 */
typedef enum {
	XHDR_VERSION1 = 1,
	XHDR_VERSION2 = 2,
} xhdr_version_t;
204 
/*%
 * The on-disk representation of the journal header.
 * All numbers are stored in big-endian order.
 *
 * The union with 'pad' makes the type JOURNAL_HEADER_SIZE bytes long as
 * long as the fields in 'h' fit within that size.
 */
typedef union {
	struct {
		/*% File format version ID. */
		unsigned char format[16];
		/*% Position of the first addressable transaction */
		journal_rawpos_t begin;
		/*% Position of the next (yet nonexistent) transaction. */
		journal_rawpos_t end;
		/*% Number of index entries following the header. */
		unsigned char index_size[4];
		/*% Source serial number. */
		unsigned char sourceserial[4];
		/*% Flag bits; JOURNAL_SERIALSET is the only one used here. */
		unsigned char flags;
	} h;
	/* Pad the header to a fixed size. */
	unsigned char pad[JOURNAL_HEADER_SIZE];
} journal_rawheader_t;
226 
/*%
 * The on-disk representation of the transaction header, version 2.
 * There is one of these at the beginning of each transaction.
 * Each field is a 32-bit big-endian integer.
 */
typedef struct {
	unsigned char size[4];	  /*%< In bytes, excluding header. */
	unsigned char count[4];	  /*%< Number of records in transaction */
	unsigned char serial0[4]; /*%< SOA serial before update. */
	unsigned char serial1[4]; /*%< SOA serial after update. */
} journal_rawxhdr_t;

/*%
 * Old-style raw transaction header, version 1, used for backward
 * compatibility mode.  It has the same fields as version 2 except
 * that there is no 'count' field.
 */
typedef struct {
	unsigned char size[4];
	unsigned char serial0[4];
	unsigned char serial1[4];
} journal_rawxhdr_ver1_t;

/*%
 * The on-disk representation of the RR header.
 * There is one of these at the beginning of each RR.
 */
typedef struct {
	unsigned char size[4]; /*%< In bytes, excluding header. */
} journal_rawrrhdr_t;
255 
/*%
 * The in-core representation of a journal position, i.e. the decoded
 * form of journal_rawpos_t.
 */
typedef struct {
	uint32_t serial;
	isc_offset_t offset;
} journal_pos_t;

/*% A position is valid when its offset is nonzero. */
#define POS_VALID(pos)	    ((pos).offset != 0)
/*% Mark a position as invalid by zeroing both of its fields. */
#define POS_INVALIDATE(pos) ((pos).offset = 0, (pos).serial = 0)
266 
/*%
 * The in-core representation of the journal header, i.e. the decoded
 * form of journal_rawheader_t.
 */
typedef struct {
	unsigned char format[16];
	journal_pos_t begin;
	journal_pos_t end;
	uint32_t index_size;
	uint32_t sourceserial;
	bool serialset;
} journal_header_t;

/*%
 * The in-core representation of the transaction header.
 * 'count' is set to zero when a version 1 header is read, since that
 * layout has no count field.
 */
typedef struct {
	uint32_t size;
	uint32_t count;
	uint32_t serial0;
	uint32_t serial1;
} journal_xhdr_t;

/*%
 * The in-core representation of the RR header.
 */
typedef struct {
	uint32_t size;
} journal_rrhdr_t;
292 
/*%
 * Initial contents to store in the header of a newly created
 * journal file.
 *
 * The header starts with the magic string ";BIND LOG V9.2\n"
 * (or ";BIND LOG V9\n" for the older, version 1 header that is
 * written when a downgrade is requested) to identify the file as a
 * BIND 9 journal file.  An ASCII identification string is used
 * rather than a binary magic number to be consistent with BIND 8
 * (BIND 8 journal files are ASCII text files).
 */

static journal_header_t journal_header_ver1 = {
	";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0, 0, 0
};
static journal_header_t initial_journal_header = {
	";BIND LOG V9.2\n", { 0, 0 }, { 0, 0 }, 0, 0, 0
};

/*% A journal is empty when its begin and end offsets coincide. */
#define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
312 
/*%
 * State of an open journal.  journal_open() starts a journal in
 * JOURNAL_STATE_INVALID and, on success, leaves it in
 * JOURNAL_STATE_WRITE or JOURNAL_STATE_READ depending on whether it
 * was opened writable.
 */
typedef enum {
	JOURNAL_STATE_INVALID,
	JOURNAL_STATE_READ,
	JOURNAL_STATE_WRITE,
	JOURNAL_STATE_TRANSACTION,
	JOURNAL_STATE_INLINE
} journal_state_t;
320 
/*%
 * An open journal file: the file handle, the decoded header and index,
 * and the state used while writing a transaction or iterating over
 * the journal.
 */
struct dns_journal {
	unsigned int magic; /*%< JOUR */
	isc_mem_t *mctx;    /*%< Memory context */
	journal_state_t state; /*%< Current journal state */
	xhdr_version_t xhdr_version; /*%< Expected transaction header version */
	bool header_ver1;	     /*%< Transaction header compatibility
				      *   mode is allowed */
	bool recovered;		     /*%< A recoverable error was found
				      *   while reading the journal */
	char *filename;		     /*%< Journal file name */
	FILE *fp;		     /*%< File handle */
	isc_offset_t offset;	     /*%< Current file offset */
	journal_xhdr_t curxhdr;	     /*%< Current transaction header */
	journal_header_t header;     /*%< In-core journal header */
	unsigned char *rawindex;     /*%< In-core buffer for journal index
				      * in on-disk format */
	journal_pos_t *index;	     /*%< In-core journal index */

	/*% Current transaction state (when writing). */
	struct {
		unsigned int n_soa;   /*%< Number of SOAs seen */
		unsigned int n_rr;    /*%< Number of RRs to write */
		journal_pos_t pos[2]; /*%< Begin/end position */
	} x;

	/*% Iteration state (when reading). */
	struct {
		/* These define the part of the journal we iterate over. */
		journal_pos_t bpos; /*%< Position before first, */
		journal_pos_t cpos; /*%< before current, */
		journal_pos_t epos; /*%< and after last transaction */
		/* The rest is iterator state. */
		uint32_t current_serial; /*%< Current SOA serial */
		isc_buffer_t source;	 /*%< Data from disk */
		isc_buffer_t target;	 /*%< Data from _fromwire check */
		dns_decompress_t dctx;	 /*%< Dummy decompression ctx */
		dns_name_t name;	 /*%< Current domain name */
		dns_rdata_t rdata;	 /*%< Current rdata */
		uint32_t ttl;		 /*%< Current TTL */
		unsigned int xsize;	 /*%< Size of transaction data */
		unsigned int xpos;	 /*%< Current position in it */
		isc_result_t result;	 /*%< Result of last call */
	} it;
};

/*% Magic number stored in dns_journal.magic, and the matching check. */
#define DNS_JOURNAL_MAGIC    ISC_MAGIC('J', 'O', 'U', 'R')
#define DNS_JOURNAL_VALID(t) ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
368 
369 static void
journal_pos_decode(journal_rawpos_t * raw,journal_pos_t * cooked)370 journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
371 	cooked->serial = decode_uint32(raw->serial);
372 	cooked->offset = decode_uint32(raw->offset);
373 }
374 
375 static void
journal_pos_encode(journal_rawpos_t * raw,journal_pos_t * cooked)376 journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
377 	encode_uint32(cooked->serial, raw->serial);
378 	encode_uint32(cooked->offset, raw->offset);
379 }
380 
381 static void
journal_header_decode(journal_rawheader_t * raw,journal_header_t * cooked)382 journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
383 	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
384 
385 	memmove(cooked->format, raw->h.format, sizeof(cooked->format));
386 	journal_pos_decode(&raw->h.begin, &cooked->begin);
387 	journal_pos_decode(&raw->h.end, &cooked->end);
388 	cooked->index_size = decode_uint32(raw->h.index_size);
389 	cooked->sourceserial = decode_uint32(raw->h.sourceserial);
390 	cooked->serialset = ((raw->h.flags & JOURNAL_SERIALSET) != 0);
391 }
392 
393 static void
journal_header_encode(journal_header_t * cooked,journal_rawheader_t * raw)394 journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
395 	unsigned char flags = 0;
396 
397 	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
398 
399 	memset(raw->pad, 0, sizeof(raw->pad));
400 	memmove(raw->h.format, cooked->format, sizeof(raw->h.format));
401 	journal_pos_encode(&raw->h.begin, &cooked->begin);
402 	journal_pos_encode(&raw->h.end, &cooked->end);
403 	encode_uint32(cooked->index_size, raw->h.index_size);
404 	encode_uint32(cooked->sourceserial, raw->h.sourceserial);
405 	if (cooked->serialset) {
406 		flags |= JOURNAL_SERIALSET;
407 	}
408 	raw->h.flags = flags;
409 }
410 
411 /*
412  * Journal file I/O subroutines, with error checking and reporting.
413  */
414 static isc_result_t
journal_seek(dns_journal_t * j,uint32_t offset)415 journal_seek(dns_journal_t *j, uint32_t offset) {
416 	isc_result_t result;
417 
418 	result = isc_stdio_seek(j->fp, (off_t)offset, SEEK_SET);
419 	if (result != ISC_R_SUCCESS) {
420 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
421 			      "%s: seek: %s", j->filename,
422 			      isc_result_totext(result));
423 		return (ISC_R_UNEXPECTED);
424 	}
425 	j->offset = offset;
426 	return (ISC_R_SUCCESS);
427 }
428 
429 static isc_result_t
journal_read(dns_journal_t * j,void * mem,size_t nbytes)430 journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
431 	isc_result_t result;
432 
433 	result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
434 	if (result != ISC_R_SUCCESS) {
435 		if (result == ISC_R_EOF) {
436 			return (ISC_R_NOMORE);
437 		}
438 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
439 			      "%s: read: %s", j->filename,
440 			      isc_result_totext(result));
441 		return (ISC_R_UNEXPECTED);
442 	}
443 	j->offset += (isc_offset_t)nbytes;
444 	return (ISC_R_SUCCESS);
445 }
446 
447 static isc_result_t
journal_write(dns_journal_t * j,void * mem,size_t nbytes)448 journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
449 	isc_result_t result;
450 
451 	result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
452 	if (result != ISC_R_SUCCESS) {
453 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
454 			      "%s: write: %s", j->filename,
455 			      isc_result_totext(result));
456 		return (ISC_R_UNEXPECTED);
457 	}
458 	j->offset += (isc_offset_t)nbytes;
459 	return (ISC_R_SUCCESS);
460 }
461 
462 static isc_result_t
journal_fsync(dns_journal_t * j)463 journal_fsync(dns_journal_t *j) {
464 	isc_result_t result;
465 
466 	result = isc_stdio_flush(j->fp);
467 	if (result != ISC_R_SUCCESS) {
468 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
469 			      "%s: flush: %s", j->filename,
470 			      isc_result_totext(result));
471 		return (ISC_R_UNEXPECTED);
472 	}
473 	result = isc_stdio_sync(j->fp);
474 	if (result != ISC_R_SUCCESS) {
475 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
476 			      "%s: fsync: %s", j->filename,
477 			      isc_result_totext(result));
478 		return (ISC_R_UNEXPECTED);
479 	}
480 	return (ISC_R_SUCCESS);
481 }
482 
483 /*
484  * Read/write a transaction header at the current file position.
485  */
486 static isc_result_t
journal_read_xhdr(dns_journal_t * j,journal_xhdr_t * xhdr)487 journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
488 	isc_result_t result;
489 
490 	j->it.cpos.offset = j->offset;
491 
492 	switch (j->xhdr_version) {
493 	case XHDR_VERSION1: {
494 		journal_rawxhdr_ver1_t raw;
495 		result = journal_read(j, &raw, sizeof(raw));
496 		if (result != ISC_R_SUCCESS) {
497 			return (result);
498 		}
499 		xhdr->size = decode_uint32(raw.size);
500 		xhdr->count = 0;
501 		xhdr->serial0 = decode_uint32(raw.serial0);
502 		xhdr->serial1 = decode_uint32(raw.serial1);
503 		j->curxhdr = *xhdr;
504 		return (ISC_R_SUCCESS);
505 	}
506 
507 	case XHDR_VERSION2: {
508 		journal_rawxhdr_t raw;
509 		result = journal_read(j, &raw, sizeof(raw));
510 		if (result != ISC_R_SUCCESS) {
511 			return (result);
512 		}
513 		xhdr->size = decode_uint32(raw.size);
514 		xhdr->count = decode_uint32(raw.count);
515 		xhdr->serial0 = decode_uint32(raw.serial0);
516 		xhdr->serial1 = decode_uint32(raw.serial1);
517 		j->curxhdr = *xhdr;
518 		return (ISC_R_SUCCESS);
519 	}
520 
521 	default:
522 		return (ISC_R_NOTIMPLEMENTED);
523 	}
524 }
525 
526 static isc_result_t
journal_write_xhdr(dns_journal_t * j,uint32_t size,uint32_t count,uint32_t serial0,uint32_t serial1)527 journal_write_xhdr(dns_journal_t *j, uint32_t size, uint32_t count,
528 		   uint32_t serial0, uint32_t serial1) {
529 	if (j->header_ver1) {
530 		journal_rawxhdr_ver1_t raw;
531 		encode_uint32(size, raw.size);
532 		encode_uint32(serial0, raw.serial0);
533 		encode_uint32(serial1, raw.serial1);
534 		return (journal_write(j, &raw, sizeof(raw)));
535 	} else {
536 		journal_rawxhdr_t raw;
537 		encode_uint32(size, raw.size);
538 		encode_uint32(count, raw.count);
539 		encode_uint32(serial0, raw.serial0);
540 		encode_uint32(serial1, raw.serial1);
541 		return (journal_write(j, &raw, sizeof(raw)));
542 	}
543 }
544 
545 /*
546  * Read an RR header at the current file position.
547  */
548 
549 static isc_result_t
journal_read_rrhdr(dns_journal_t * j,journal_rrhdr_t * rrhdr)550 journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
551 	journal_rawrrhdr_t raw;
552 	isc_result_t result;
553 
554 	result = journal_read(j, &raw, sizeof(raw));
555 	if (result != ISC_R_SUCCESS) {
556 		return (result);
557 	}
558 	rrhdr->size = decode_uint32(raw.size);
559 	return (ISC_R_SUCCESS);
560 }
561 
562 static isc_result_t
journal_file_create(isc_mem_t * mctx,bool downgrade,const char * filename)563 journal_file_create(isc_mem_t *mctx, bool downgrade, const char *filename) {
564 	FILE *fp = NULL;
565 	isc_result_t result;
566 	journal_header_t header;
567 	journal_rawheader_t rawheader;
568 	int index_size = 56; /* XXX configurable */
569 	int size;
570 	void *mem = NULL; /* Memory for temporary index image. */
571 
572 	INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
573 
574 	result = isc_stdio_open(filename, "wb", &fp);
575 	if (result != ISC_R_SUCCESS) {
576 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
577 			      "%s: create: %s", filename,
578 			      isc_result_totext(result));
579 		return (ISC_R_UNEXPECTED);
580 	}
581 
582 	if (downgrade) {
583 		header = journal_header_ver1;
584 	} else {
585 		header = initial_journal_header;
586 	}
587 	header.index_size = index_size;
588 	journal_header_encode(&header, &rawheader);
589 
590 	size = sizeof(journal_rawheader_t) +
591 	       index_size * sizeof(journal_rawpos_t);
592 
593 	mem = isc_mem_get(mctx, size);
594 	memset(mem, 0, size);
595 	memmove(mem, &rawheader, sizeof(rawheader));
596 
597 	result = isc_stdio_write(mem, 1, (size_t)size, fp, NULL);
598 	if (result != ISC_R_SUCCESS) {
599 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
600 			      "%s: write: %s", filename,
601 			      isc_result_totext(result));
602 		(void)isc_stdio_close(fp);
603 		(void)isc_file_remove(filename);
604 		isc_mem_put(mctx, mem, size);
605 		return (ISC_R_UNEXPECTED);
606 	}
607 	isc_mem_put(mctx, mem, size);
608 
609 	result = isc_stdio_close(fp);
610 	if (result != ISC_R_SUCCESS) {
611 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
612 			      "%s: close: %s", filename,
613 			      isc_result_totext(result));
614 		(void)isc_file_remove(filename);
615 		return (ISC_R_UNEXPECTED);
616 	}
617 
618 	return (ISC_R_SUCCESS);
619 }
620 
621 static isc_result_t
journal_open(isc_mem_t * mctx,const char * filename,bool writable,bool create,bool downgrade,dns_journal_t ** journalp)622 journal_open(isc_mem_t *mctx, const char *filename, bool writable, bool create,
623 	     bool downgrade, dns_journal_t **journalp) {
624 	FILE *fp = NULL;
625 	isc_result_t result;
626 	journal_rawheader_t rawheader;
627 	dns_journal_t *j;
628 
629 	REQUIRE(journalp != NULL && *journalp == NULL);
630 
631 	j = isc_mem_get(mctx, sizeof(*j));
632 	*j = (dns_journal_t){ .state = JOURNAL_STATE_INVALID,
633 			      .filename = isc_mem_strdup(mctx, filename),
634 			      .xhdr_version = XHDR_VERSION2 };
635 	isc_mem_attach(mctx, &j->mctx);
636 
637 	result = isc_stdio_open(j->filename, writable ? "rb+" : "rb", &fp);
638 	if (result == ISC_R_FILENOTFOUND) {
639 		if (create) {
640 			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(1),
641 				      "journal file %s does not exist, "
642 				      "creating it",
643 				      j->filename);
644 			CHECK(journal_file_create(mctx, downgrade, filename));
645 			/*
646 			 * Retry.
647 			 */
648 			result = isc_stdio_open(j->filename, "rb+", &fp);
649 		} else {
650 			FAIL(ISC_R_NOTFOUND);
651 		}
652 	}
653 	if (result != ISC_R_SUCCESS) {
654 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
655 			      "%s: open: %s", j->filename,
656 			      isc_result_totext(result));
657 		FAIL(ISC_R_UNEXPECTED);
658 	}
659 
660 	j->fp = fp;
661 
662 	/*
663 	 * Set magic early so that seek/read can succeed.
664 	 */
665 	j->magic = DNS_JOURNAL_MAGIC;
666 
667 	CHECK(journal_seek(j, 0));
668 	CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
669 
670 	if (memcmp(rawheader.h.format, journal_header_ver1.format,
671 		   sizeof(journal_header_ver1.format)) == 0)
672 	{
673 		/*
674 		 * The file header says it's the old format, but it
675 		 * still might have the new xhdr format because we
676 		 * forgot to change the format string when we introduced
677 		 * the new xhdr.  When we first try to read it, we assume
678 		 * it uses the new xhdr format. If that fails, we'll be
679 		 * called a second time with compat set to true, in which
680 		 * case we can lower xhdr_version to 1 if we find a
681 		 * corrupt transaction.
682 		 */
683 		j->header_ver1 = true;
684 	} else if (memcmp(rawheader.h.format, initial_journal_header.format,
685 			  sizeof(initial_journal_header.format)) == 0)
686 	{
687 		/*
688 		 * File header says this is format version 2; all
689 		 * transactions have to match.
690 		 */
691 		j->header_ver1 = false;
692 	} else {
693 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
694 			      "%s: journal format not recognized", j->filename);
695 		FAIL(ISC_R_UNEXPECTED);
696 	}
697 	journal_header_decode(&rawheader, &j->header);
698 
699 	/*
700 	 * If there is an index, read the raw index into a dynamically
701 	 * allocated buffer and then convert it into a cooked index.
702 	 */
703 	if (j->header.index_size != 0) {
704 		unsigned int i;
705 		unsigned int rawbytes;
706 		unsigned char *p;
707 
708 		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
709 		j->rawindex = isc_mem_get(mctx, rawbytes);
710 
711 		CHECK(journal_read(j, j->rawindex, rawbytes));
712 
713 		j->index = isc_mem_get(mctx, j->header.index_size *
714 						     sizeof(journal_pos_t));
715 
716 		p = j->rawindex;
717 		for (i = 0; i < j->header.index_size; i++) {
718 			j->index[i].serial = decode_uint32(p);
719 			p += 4;
720 			j->index[i].offset = decode_uint32(p);
721 			p += 4;
722 		}
723 		INSIST(p == j->rawindex + rawbytes);
724 	}
725 	j->offset = -1; /* Invalid, must seek explicitly. */
726 
727 	/*
728 	 * Initialize the iterator.
729 	 */
730 	dns_name_init(&j->it.name, NULL);
731 	dns_rdata_init(&j->it.rdata);
732 
733 	/*
734 	 * Set up empty initial buffers for unchecked and checked
735 	 * wire format RR data.  They will be reallocated
736 	 * later.
737 	 */
738 	isc_buffer_init(&j->it.source, NULL, 0);
739 	isc_buffer_init(&j->it.target, NULL, 0);
740 	dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
741 
742 	j->state = writable ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
743 
744 	*journalp = j;
745 	return (ISC_R_SUCCESS);
746 
747 failure:
748 	j->magic = 0;
749 	if (j->rawindex != NULL) {
750 		isc_mem_put(j->mctx, j->rawindex,
751 			    j->header.index_size * sizeof(journal_rawpos_t));
752 	}
753 	if (j->index != NULL) {
754 		isc_mem_put(j->mctx, j->index,
755 			    j->header.index_size * sizeof(journal_pos_t));
756 	}
757 	isc_mem_free(j->mctx, j->filename);
758 	if (j->fp != NULL) {
759 		(void)isc_stdio_close(j->fp);
760 	}
761 	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
762 	return (result);
763 }
764 
765 isc_result_t
dns_journal_open(isc_mem_t * mctx,const char * filename,unsigned int mode,dns_journal_t ** journalp)766 dns_journal_open(isc_mem_t *mctx, const char *filename, unsigned int mode,
767 		 dns_journal_t **journalp) {
768 	isc_result_t result;
769 	size_t namelen;
770 	char backup[1024];
771 	bool writable, create;
772 
773 	create = ((mode & DNS_JOURNAL_CREATE) != 0);
774 	writable = ((mode & (DNS_JOURNAL_WRITE | DNS_JOURNAL_CREATE)) != 0);
775 
776 	result = journal_open(mctx, filename, writable, create, false,
777 			      journalp);
778 	if (result == ISC_R_NOTFOUND) {
779 		namelen = strlen(filename);
780 		if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0)
781 		{
782 			namelen -= 4;
783 		}
784 
785 		result = snprintf(backup, sizeof(backup), "%.*s.jbk",
786 				  (int)namelen, filename);
787 		if (result >= sizeof(backup)) {
788 			return (ISC_R_NOSPACE);
789 		}
790 		result = journal_open(mctx, backup, writable, writable, false,
791 				      journalp);
792 	}
793 	return (result);
794 }
795 
796 /*
797  * A comparison function defining the sorting order for
798  * entries in the IXFR-style journal file.
799  *
800  * The IXFR format requires that deletions are sorted before
801  * additions, and within either one, SOA records are sorted
802  * before others.
803  *
804  * Also sort the non-SOA records by type as a courtesy to the
805  * server receiving the IXFR - it may help reduce the amount of
806  * rdataset merging it has to do.
807  */
808 static int
ixfr_order(const void * av,const void * bv)809 ixfr_order(const void *av, const void *bv) {
810 	dns_difftuple_t const *const *ap = av;
811 	dns_difftuple_t const *const *bp = bv;
812 	dns_difftuple_t const *a = *ap;
813 	dns_difftuple_t const *b = *bp;
814 	int r;
815 	int bop = 0, aop = 0;
816 
817 	switch (a->op) {
818 	case DNS_DIFFOP_DEL:
819 	case DNS_DIFFOP_DELRESIGN:
820 		aop = 1;
821 		break;
822 	case DNS_DIFFOP_ADD:
823 	case DNS_DIFFOP_ADDRESIGN:
824 		aop = 0;
825 		break;
826 	default:
827 		INSIST(0);
828 		ISC_UNREACHABLE();
829 	}
830 
831 	switch (b->op) {
832 	case DNS_DIFFOP_DEL:
833 	case DNS_DIFFOP_DELRESIGN:
834 		bop = 1;
835 		break;
836 	case DNS_DIFFOP_ADD:
837 	case DNS_DIFFOP_ADDRESIGN:
838 		bop = 0;
839 		break;
840 	default:
841 		INSIST(0);
842 		ISC_UNREACHABLE();
843 	}
844 
845 	r = bop - aop;
846 	if (r != 0) {
847 		return (r);
848 	}
849 
850 	r = (b->rdata.type == dns_rdatatype_soa) -
851 	    (a->rdata.type == dns_rdatatype_soa);
852 	if (r != 0) {
853 		return (r);
854 	}
855 
856 	r = (a->rdata.type - b->rdata.type);
857 	return (r);
858 }
859 
/*
 * Try to make sense of a transaction header that may have been read
 * with the wrong header layout.
 *
 * '*xhdr' is the header that was read at file position 'offset', and
 * 'serial' is the serial number it is expected to start from.  This
 * function may change j->xhdr_version, re-read the header into '*xhdr',
 * rearrange the fields of '*xhdr', and move the file position.
 * j->recovered is set whenever such a correction is made.
 *
 * journal_next() calls this when the file header is the version 1
 * one (j->header_ver1).
 *
 * Returns ISC_R_SUCCESS, or the result of a failed seek or read.
 */
static isc_result_t
maybe_fixup_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr, uint32_t serial,
		 isc_offset_t offset) {
	isc_result_t result = ISC_R_SUCCESS;

	/*
	 * Handle mixture of version 1 and version 2
	 * transaction headers in a version 1 journal.
	 */
	if ((xhdr->serial0 != serial ||
	     isc_serial_le(xhdr->serial1, xhdr->serial0))) {
		/*
		 * The header looks inconsistent.  If the expected serial
		 * shows up in the field where the other layout would have
		 * put serial0, switch layouts and read the header again.
		 */
		if (j->xhdr_version == XHDR_VERSION1 && xhdr->serial1 == serial)
		{
			isc_log_write(
				JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
				"%s: XHDR_VERSION1 -> XHDR_VERSION2 at %u",
				j->filename, serial);
			j->xhdr_version = XHDR_VERSION2;
			CHECK(journal_seek(j, offset));
			CHECK(journal_read_xhdr(j, xhdr));
			j->recovered = true;
		} else if (j->xhdr_version == XHDR_VERSION2 &&
			   xhdr->count == serial) {
			isc_log_write(
				JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
				"%s: XHDR_VERSION2 -> XHDR_VERSION1 at %u",
				j->filename, serial);
			j->xhdr_version = XHDR_VERSION1;
			CHECK(journal_seek(j, offset));
			CHECK(journal_read_xhdr(j, xhdr));
			j->recovered = true;
		}
	}

	/*
	 * Handle <size, serial0, serial1, 0> transaction header.
	 */
	if (j->xhdr_version == XHDR_VERSION1) {
		uint32_t value;

		/*
		 * Peek at the four bytes following the version 1 header.
		 * If they are nonzero, go back to just after the 12-byte
		 * version 1 header.  If they are zero, stay where we are
		 * (16 bytes past 'offset') and use the version 2 layout
		 * from now on.
		 */
		CHECK(journal_read(j, &value, sizeof(value)));
		if (value != 0L) {
			CHECK(journal_seek(j, offset + 12));
		} else {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
				      "%s: XHDR_VERSION1 count zero at %u",
				      j->filename, serial);
			j->xhdr_version = XHDR_VERSION2;
			j->recovered = true;
		}
	} else if (j->xhdr_version == XHDR_VERSION2 && xhdr->count == serial &&
		   xhdr->serial1 == 0U &&
		   isc_serial_gt(xhdr->serial0, xhdr->count))
	{
		/*
		 * The same four-word header read with the version 2
		 * layout: each value sits one field too early, so move
		 * them into place and set the count to zero.
		 */
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(3),
			      "%s: XHDR_VERSION2 count zero at %u", j->filename,
			      serial);
		xhdr->serial1 = xhdr->serial0;
		xhdr->serial0 = xhdr->count;
		xhdr->count = 0;
		j->recovered = true;
	}

failure:
	return (result);
}
926 
927 /*
928  * Advance '*pos' to the next journal transaction.
929  *
930  * Requires:
931  *	*pos refers to a valid journal transaction.
932  *
933  * Ensures:
934  *	When ISC_R_SUCCESS is returned,
935  *	*pos refers to the next journal transaction.
936  *
937  * Returns one of:
938  *
939  *    ISC_R_SUCCESS
940  *    ISC_R_NOMORE 	*pos pointed at the last transaction
941  *    Other results due to file errors are possible.
942  */
943 static isc_result_t
journal_next(dns_journal_t * j,journal_pos_t * pos)944 journal_next(dns_journal_t *j, journal_pos_t *pos) {
945 	isc_result_t result;
946 	journal_xhdr_t xhdr;
947 	size_t hdrsize;
948 
949 	REQUIRE(DNS_JOURNAL_VALID(j));
950 
951 	result = journal_seek(j, pos->offset);
952 	if (result != ISC_R_SUCCESS) {
953 		return (result);
954 	}
955 
956 	if (pos->serial == j->header.end.serial) {
957 		return (ISC_R_NOMORE);
958 	}
959 
960 	/*
961 	 * Read the header of the current transaction.
962 	 * This will return ISC_R_NOMORE if we are at EOF.
963 	 */
964 	result = journal_read_xhdr(j, &xhdr);
965 	if (result != ISC_R_SUCCESS) {
966 		return (result);
967 	}
968 
969 	if (j->header_ver1) {
970 		CHECK(maybe_fixup_xhdr(j, &xhdr, pos->serial, pos->offset));
971 	}
972 
973 	/*
974 	 * Check serial number consistency.
975 	 */
976 	if (xhdr.serial0 != pos->serial ||
977 	    isc_serial_le(xhdr.serial1, xhdr.serial0)) {
978 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
979 			      "%s: journal file corrupt: "
980 			      "expected serial %u, got %u",
981 			      j->filename, pos->serial, xhdr.serial0);
982 		return (ISC_R_UNEXPECTED);
983 	}
984 
985 	/*
986 	 * Check for offset wraparound.
987 	 */
988 	hdrsize = (j->xhdr_version == XHDR_VERSION2)
989 			  ? sizeof(journal_rawxhdr_t)
990 			  : sizeof(journal_rawxhdr_ver1_t);
991 
992 	if ((isc_offset_t)(pos->offset + hdrsize + xhdr.size) < pos->offset) {
993 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
994 			      "%s: offset too large", j->filename);
995 		return (ISC_R_UNEXPECTED);
996 	}
997 
998 	pos->offset += hdrsize + xhdr.size;
999 	pos->serial = xhdr.serial1;
1000 	return (ISC_R_SUCCESS);
1001 
1002 failure:
1003 	return (result);
1004 }
1005 
1006 /*
1007  * If the index of the journal 'j' contains an entry "better"
1008  * than '*best_guess', replace '*best_guess' with it.
1009  *
1010  * "Better" means having a serial number closer to 'serial'
1011  * but not greater than 'serial'.
1012  */
1013 static void
index_find(dns_journal_t * j,uint32_t serial,journal_pos_t * best_guess)1014 index_find(dns_journal_t *j, uint32_t serial, journal_pos_t *best_guess) {
1015 	unsigned int i;
1016 	if (j->index == NULL) {
1017 		return;
1018 	}
1019 	for (i = 0; i < j->header.index_size; i++) {
1020 		if (POS_VALID(j->index[i]) &&
1021 		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
1022 		    DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
1023 		{
1024 			*best_guess = j->index[i];
1025 		}
1026 	}
1027 }
1028 
1029 /*
1030  * Add a new index entry.  If there is no room, make room by removing
1031  * the odd-numbered entries and compacting the others into the first
1032  * half of the index.  This decimates old index entries exponentially
1033  * over time, so that the index always contains a much larger fraction
1034  * of recent serial numbers than of old ones.  This is deliberate -
1035  * most index searches are for outgoing IXFR, and IXFR tends to request
1036  * recent versions more often than old ones.
1037  */
1038 static void
index_add(dns_journal_t * j,journal_pos_t * pos)1039 index_add(dns_journal_t *j, journal_pos_t *pos) {
1040 	unsigned int i;
1041 
1042 	if (j->index == NULL) {
1043 		return;
1044 	}
1045 
1046 	/*
1047 	 * Search for a vacant position.
1048 	 */
1049 	for (i = 0; i < j->header.index_size; i++) {
1050 		if (!POS_VALID(j->index[i])) {
1051 			break;
1052 		}
1053 	}
1054 	if (i == j->header.index_size) {
1055 		unsigned int k = 0;
1056 		/*
1057 		 * Found no vacant position.  Make some room.
1058 		 */
1059 		for (i = 0; i < j->header.index_size; i += 2) {
1060 			j->index[k++] = j->index[i];
1061 		}
1062 		i = k; /* 'i' identifies the first vacant position. */
1063 		while (k < j->header.index_size) {
1064 			POS_INVALIDATE(j->index[k]);
1065 			k++;
1066 		}
1067 	}
1068 	INSIST(i < j->header.index_size);
1069 	INSIST(!POS_VALID(j->index[i]));
1070 
1071 	/*
1072 	 * Store the new index entry.
1073 	 */
1074 	j->index[i] = *pos;
1075 }
1076 
1077 /*
1078  * Invalidate any existing index entries that could become
1079  * ambiguous when a new transaction with number 'serial' is added.
1080  */
1081 static void
index_invalidate(dns_journal_t * j,uint32_t serial)1082 index_invalidate(dns_journal_t *j, uint32_t serial) {
1083 	unsigned int i;
1084 	if (j->index == NULL) {
1085 		return;
1086 	}
1087 	for (i = 0; i < j->header.index_size; i++) {
1088 		if (!DNS_SERIAL_GT(serial, j->index[i].serial)) {
1089 			POS_INVALIDATE(j->index[i]);
1090 		}
1091 	}
1092 }
1093 
1094 /*
1095  * Try to find a transaction with initial serial number 'serial'
1096  * in the journal 'j'.
1097  *
1098  * If found, store its position at '*pos' and return ISC_R_SUCCESS.
1099  *
1100  * If 'serial' is current (= the ending serial number of the
1101  * last transaction in the journal), set '*pos' to
1102  * the position immediately following the last transaction and
1103  * return ISC_R_SUCCESS.
1104  *
1105  * If 'serial' is within the range of addressable serial numbers
1106  * covered by the journal but that particular serial number is missing
1107  * (from the journal, not just from the index), return ISC_R_NOTFOUND.
1108  *
1109  * If 'serial' is outside the range of addressable serial numbers
1110  * covered by the journal, return ISC_R_RANGE.
1111  *
1112  */
1113 static isc_result_t
journal_find(dns_journal_t * j,uint32_t serial,journal_pos_t * pos)1114 journal_find(dns_journal_t *j, uint32_t serial, journal_pos_t *pos) {
1115 	isc_result_t result;
1116 	journal_pos_t current_pos;
1117 
1118 	REQUIRE(DNS_JOURNAL_VALID(j));
1119 
1120 	if (DNS_SERIAL_GT(j->header.begin.serial, serial)) {
1121 		return (ISC_R_RANGE);
1122 	}
1123 	if (DNS_SERIAL_GT(serial, j->header.end.serial)) {
1124 		return (ISC_R_RANGE);
1125 	}
1126 	if (serial == j->header.end.serial) {
1127 		*pos = j->header.end;
1128 		return (ISC_R_SUCCESS);
1129 	}
1130 
1131 	current_pos = j->header.begin;
1132 	index_find(j, serial, &current_pos);
1133 
1134 	while (current_pos.serial != serial) {
1135 		if (DNS_SERIAL_GT(current_pos.serial, serial)) {
1136 			return (ISC_R_NOTFOUND);
1137 		}
1138 		result = journal_next(j, &current_pos);
1139 		if (result != ISC_R_SUCCESS) {
1140 			return (result);
1141 		}
1142 	}
1143 	*pos = current_pos;
1144 	return (ISC_R_SUCCESS);
1145 }
1146 
1147 isc_result_t
dns_journal_begin_transaction(dns_journal_t * j)1148 dns_journal_begin_transaction(dns_journal_t *j) {
1149 	uint32_t offset;
1150 	isc_result_t result;
1151 
1152 	REQUIRE(DNS_JOURNAL_VALID(j));
1153 	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
1154 		j->state == JOURNAL_STATE_INLINE);
1155 
1156 	/*
1157 	 * Find the file offset where the new transaction should
1158 	 * be written, and seek there.
1159 	 */
1160 	if (JOURNAL_EMPTY(&j->header)) {
1161 		offset = sizeof(journal_rawheader_t) +
1162 			 j->header.index_size * sizeof(journal_rawpos_t);
1163 	} else {
1164 		offset = j->header.end.offset;
1165 	}
1166 	j->x.pos[0].offset = offset;
1167 	j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
1168 	j->x.n_soa = 0;
1169 
1170 	CHECK(journal_seek(j, offset));
1171 
1172 	/*
1173 	 * Write a dummy transaction header of all zeroes to reserve
1174 	 * space.  It will be filled in when the transaction is
1175 	 * finished.
1176 	 */
1177 	CHECK(journal_write_xhdr(j, 0, 0, 0, 0));
1178 	j->x.pos[1].offset = j->offset;
1179 
1180 	j->state = JOURNAL_STATE_TRANSACTION;
1181 	result = ISC_R_SUCCESS;
1182 failure:
1183 	return (result);
1184 }
1185 
1186 isc_result_t
dns_journal_writediff(dns_journal_t * j,dns_diff_t * diff)1187 dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
1188 	dns_difftuple_t *t;
1189 	isc_buffer_t buffer;
1190 	void *mem = NULL;
1191 	uint64_t size = 0;
1192 	uint32_t rrcount = 0;
1193 	isc_result_t result;
1194 	isc_region_t used;
1195 
1196 	REQUIRE(DNS_DIFF_VALID(diff));
1197 	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1198 
1199 	isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
1200 	(void)dns_diff_print(diff, NULL);
1201 
1202 	/*
1203 	 * Pass 1: determine the buffer size needed, and
1204 	 * keep track of SOA serial numbers.
1205 	 */
1206 	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1207 	     t = ISC_LIST_NEXT(t, link)) {
1208 		if (t->rdata.type == dns_rdatatype_soa) {
1209 			if (j->x.n_soa < 2) {
1210 				j->x.pos[j->x.n_soa].serial =
1211 					dns_soa_getserial(&t->rdata);
1212 			}
1213 			j->x.n_soa++;
1214 		}
1215 		size += sizeof(journal_rawrrhdr_t);
1216 		size += t->name.length; /* XXX should have access macro? */
1217 		size += 10;
1218 		size += t->rdata.length;
1219 	}
1220 
1221 	if (size >= DNS_JOURNAL_SIZE_MAX) {
1222 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1223 			      "dns_journal_writediff: %s: journal entry "
1224 			      "too big to be stored: %" PRIu64 " bytes",
1225 			      j->filename, size);
1226 		return (ISC_R_NOSPACE);
1227 	}
1228 
1229 	mem = isc_mem_get(j->mctx, size);
1230 
1231 	isc_buffer_init(&buffer, mem, size);
1232 
1233 	/*
1234 	 * Pass 2.  Write RRs to buffer.
1235 	 */
1236 	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1237 	     t = ISC_LIST_NEXT(t, link)) {
1238 		/*
1239 		 * Write the RR header.
1240 		 */
1241 		isc_buffer_putuint32(&buffer,
1242 				     t->name.length + 10 + t->rdata.length);
1243 		/*
1244 		 * Write the owner name, RR header, and RR data.
1245 		 */
1246 		isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
1247 		isc_buffer_putuint16(&buffer, t->rdata.type);
1248 		isc_buffer_putuint16(&buffer, t->rdata.rdclass);
1249 		isc_buffer_putuint32(&buffer, t->ttl);
1250 		INSIST(t->rdata.length < 65536);
1251 		isc_buffer_putuint16(&buffer, (uint16_t)t->rdata.length);
1252 		INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1253 		isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1254 
1255 		rrcount++;
1256 	}
1257 
1258 	isc_buffer_usedregion(&buffer, &used);
1259 	INSIST(used.length == size);
1260 
1261 	j->x.pos[1].offset += used.length;
1262 	j->x.n_rr = rrcount;
1263 
1264 	/*
1265 	 * Write the buffer contents to the journal file.
1266 	 */
1267 	CHECK(journal_write(j, used.base, used.length));
1268 
1269 	result = ISC_R_SUCCESS;
1270 
1271 failure:
1272 	if (mem != NULL) {
1273 		isc_mem_put(j->mctx, mem, size);
1274 	}
1275 	return (result);
1276 }
1277 
1278 isc_result_t
dns_journal_commit(dns_journal_t * j)1279 dns_journal_commit(dns_journal_t *j) {
1280 	isc_result_t result;
1281 	journal_rawheader_t rawheader;
1282 	uint64_t total;
1283 
1284 	REQUIRE(DNS_JOURNAL_VALID(j));
1285 	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION ||
1286 		j->state == JOURNAL_STATE_INLINE);
1287 
1288 	/*
1289 	 * Just write out a updated header.
1290 	 */
1291 	if (j->state == JOURNAL_STATE_INLINE) {
1292 		CHECK(journal_fsync(j));
1293 		journal_header_encode(&j->header, &rawheader);
1294 		CHECK(journal_seek(j, 0));
1295 		CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1296 		CHECK(journal_fsync(j));
1297 		j->state = JOURNAL_STATE_WRITE;
1298 		return (ISC_R_SUCCESS);
1299 	}
1300 
1301 	/*
1302 	 * Perform some basic consistency checks.
1303 	 */
1304 	if (j->x.n_soa != 2) {
1305 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1306 			      "%s: malformed transaction: %d SOAs", j->filename,
1307 			      j->x.n_soa);
1308 		return (ISC_R_UNEXPECTED);
1309 	}
1310 	if (!DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial)) {
1311 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1312 			      "%s: malformed transaction: serial number "
1313 			      "did not increase",
1314 			      j->filename);
1315 		return (ISC_R_UNEXPECTED);
1316 	}
1317 	if (!JOURNAL_EMPTY(&j->header)) {
1318 		if (j->x.pos[0].serial != j->header.end.serial) {
1319 			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1320 				      "malformed transaction: "
1321 				      "%s last serial %u != "
1322 				      "transaction first serial %u",
1323 				      j->filename, j->header.end.serial,
1324 				      j->x.pos[0].serial);
1325 			return (ISC_R_UNEXPECTED);
1326 		}
1327 	}
1328 
1329 	/*
1330 	 * We currently don't support huge journal entries.
1331 	 */
1332 	total = j->x.pos[1].offset - j->x.pos[0].offset;
1333 	if (total >= DNS_JOURNAL_SIZE_MAX) {
1334 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1335 			      "transaction too big to be stored in journal: "
1336 			      "%" PRIu64 "b (max is %" PRIu64 "b)",
1337 			      total, (uint64_t)DNS_JOURNAL_SIZE_MAX);
1338 		return (ISC_R_UNEXPECTED);
1339 	}
1340 
1341 	/*
1342 	 * Some old journal entries may become non-addressable
1343 	 * when we increment the current serial number.  Purge them
1344 	 * by stepping header.begin forward to the first addressable
1345 	 * transaction.  Also purge them from the index.
1346 	 */
1347 	if (!JOURNAL_EMPTY(&j->header)) {
1348 		while (!DNS_SERIAL_GT(j->x.pos[1].serial,
1349 				      j->header.begin.serial)) {
1350 			CHECK(journal_next(j, &j->header.begin));
1351 		}
1352 		index_invalidate(j, j->x.pos[1].serial);
1353 	}
1354 #ifdef notyet
1355 	if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1356 		force_dump(...);
1357 	}
1358 #endif /* ifdef notyet */
1359 
1360 	/*
1361 	 * Commit the transaction data to stable storage.
1362 	 */
1363 	CHECK(journal_fsync(j));
1364 
1365 	if (j->state == JOURNAL_STATE_TRANSACTION) {
1366 		isc_offset_t offset;
1367 		offset = (j->x.pos[1].offset - j->x.pos[0].offset) -
1368 			 (j->header_ver1 ? sizeof(journal_rawxhdr_ver1_t)
1369 					 : sizeof(journal_rawxhdr_t));
1370 		/*
1371 		 * Update the transaction header.
1372 		 */
1373 		CHECK(journal_seek(j, j->x.pos[0].offset));
1374 		CHECK(journal_write_xhdr(j, offset, j->x.n_rr,
1375 					 j->x.pos[0].serial,
1376 					 j->x.pos[1].serial));
1377 	}
1378 
1379 	/*
1380 	 * Update the journal header.
1381 	 */
1382 	if (JOURNAL_EMPTY(&j->header)) {
1383 		j->header.begin = j->x.pos[0];
1384 	}
1385 	j->header.end = j->x.pos[1];
1386 	journal_header_encode(&j->header, &rawheader);
1387 	CHECK(journal_seek(j, 0));
1388 	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1389 
1390 	/*
1391 	 * Update the index.
1392 	 */
1393 	index_add(j, &j->x.pos[0]);
1394 
1395 	/*
1396 	 * Convert the index into on-disk format and write
1397 	 * it to disk.
1398 	 */
1399 	CHECK(index_to_disk(j));
1400 
1401 	/*
1402 	 * Commit the header to stable storage.
1403 	 */
1404 	CHECK(journal_fsync(j));
1405 
1406 	/*
1407 	 * We no longer have a transaction open.
1408 	 */
1409 	j->state = JOURNAL_STATE_WRITE;
1410 
1411 	result = ISC_R_SUCCESS;
1412 
1413 failure:
1414 	return (result);
1415 }
1416 
1417 isc_result_t
dns_journal_write_transaction(dns_journal_t * j,dns_diff_t * diff)1418 dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1419 	isc_result_t result;
1420 
1421 	CHECK(dns_diff_sort(diff, ixfr_order));
1422 	CHECK(dns_journal_begin_transaction(j));
1423 	CHECK(dns_journal_writediff(j, diff));
1424 	CHECK(dns_journal_commit(j));
1425 	result = ISC_R_SUCCESS;
1426 failure:
1427 	return (result);
1428 }
1429 
1430 void
dns_journal_destroy(dns_journal_t ** journalp)1431 dns_journal_destroy(dns_journal_t **journalp) {
1432 	dns_journal_t *j = NULL;
1433 
1434 	REQUIRE(journalp != NULL);
1435 	REQUIRE(DNS_JOURNAL_VALID(*journalp));
1436 
1437 	j = *journalp;
1438 	*journalp = NULL;
1439 
1440 	j->it.result = ISC_R_FAILURE;
1441 	dns_name_invalidate(&j->it.name);
1442 	dns_decompress_invalidate(&j->it.dctx);
1443 	if (j->rawindex != NULL) {
1444 		isc_mem_put(j->mctx, j->rawindex,
1445 			    j->header.index_size * sizeof(journal_rawpos_t));
1446 	}
1447 	if (j->index != NULL) {
1448 		isc_mem_put(j->mctx, j->index,
1449 			    j->header.index_size * sizeof(journal_pos_t));
1450 	}
1451 	if (j->it.target.base != NULL) {
1452 		isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1453 	}
1454 	if (j->it.source.base != NULL) {
1455 		isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1456 	}
1457 	if (j->filename != NULL) {
1458 		isc_mem_free(j->mctx, j->filename);
1459 	}
1460 	if (j->fp != NULL) {
1461 		(void)isc_stdio_close(j->fp);
1462 	}
1463 	j->magic = 0;
1464 	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
1465 }
1466 
1467 /*
1468  * Roll the open journal 'j' into the database 'db'.
1469  * A new database version will be created.
1470  */
1471 
1472 /* XXX Share code with incoming IXFR? */
1473 
1474 isc_result_t
dns_journal_rollforward(dns_journal_t * j,dns_db_t * db,unsigned int options)1475 dns_journal_rollforward(dns_journal_t *j, dns_db_t *db, unsigned int options) {
1476 	isc_buffer_t source; /* Transaction data from disk */
1477 	isc_buffer_t target; /* Ditto after _fromwire check */
1478 	uint32_t db_serial;  /* Database SOA serial */
1479 	uint32_t end_serial; /* Last journal SOA serial */
1480 	isc_result_t result;
1481 	dns_dbversion_t *ver = NULL;
1482 	journal_pos_t pos;
1483 	dns_diff_t diff;
1484 	unsigned int n_soa = 0;
1485 	unsigned int n_put = 0;
1486 	dns_diffop_t op;
1487 
1488 	REQUIRE(DNS_JOURNAL_VALID(j));
1489 	REQUIRE(DNS_DB_VALID(db));
1490 
1491 	dns_diff_init(j->mctx, &diff);
1492 
1493 	/*
1494 	 * Set up empty initial buffers for unchecked and checked
1495 	 * wire format transaction data.  They will be reallocated
1496 	 * later.
1497 	 */
1498 	isc_buffer_init(&source, NULL, 0);
1499 	isc_buffer_init(&target, NULL, 0);
1500 
1501 	/*
1502 	 * Create the new database version.
1503 	 */
1504 	CHECK(dns_db_newversion(db, &ver));
1505 
1506 	/*
1507 	 * Get the current database SOA serial number.
1508 	 */
1509 	CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1510 
1511 	/*
1512 	 * Locate a journal entry for the current database serial.
1513 	 */
1514 	CHECK(journal_find(j, db_serial, &pos));
1515 
1516 	end_serial = dns_journal_last_serial(j);
1517 
1518 	/*
1519 	 * If we're reading a version 1 file, scan all the transactions
1520 	 * to see if the journal needs rewriting: if any outdated
1521 	 * transaction headers are found, j->recovered will be set.
1522 	 */
1523 	if (j->header_ver1) {
1524 		uint32_t start_serial = dns_journal_first_serial(j);
1525 
1526 		CHECK(dns_journal_iter_init(j, start_serial, db_serial, NULL));
1527 		for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
1528 		     result = dns_journal_next_rr(j))
1529 		{
1530 			continue;
1531 		}
1532 	}
1533 
1534 	if (db_serial == end_serial) {
1535 		CHECK(DNS_R_UPTODATE);
1536 	}
1537 
1538 	CHECK(dns_journal_iter_init(j, db_serial, end_serial, NULL));
1539 	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
1540 	     result = dns_journal_next_rr(j))
1541 	{
1542 		dns_name_t *name = NULL;
1543 		dns_rdata_t *rdata = NULL;
1544 		dns_difftuple_t *tuple = NULL;
1545 		uint32_t ttl;
1546 
1547 		dns_journal_current_rr(j, &name, &ttl, &rdata);
1548 
1549 		if (rdata->type == dns_rdatatype_soa) {
1550 			n_soa++;
1551 			if (n_soa == 2) {
1552 				db_serial = j->it.current_serial;
1553 			}
1554 		}
1555 
1556 		if (n_soa == 3) {
1557 			n_soa = 1;
1558 		}
1559 		if (n_soa == 0) {
1560 			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1561 				      "%s: journal file corrupt: missing "
1562 				      "initial SOA",
1563 				      j->filename);
1564 			FAIL(ISC_R_UNEXPECTED);
1565 		}
1566 		if ((options & DNS_JOURNALOPT_RESIGN) != 0) {
1567 			op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN
1568 					  : DNS_DIFFOP_ADDRESIGN;
1569 		} else {
1570 			op = (n_soa == 1) ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD;
1571 		}
1572 
1573 		CHECK(dns_difftuple_create(diff.mctx, op, name, ttl, rdata,
1574 					   &tuple));
1575 		dns_diff_append(&diff, &tuple);
1576 
1577 		if (++n_put > 100) {
1578 			isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1579 				      "%s: applying diff to database (%u)",
1580 				      j->filename, db_serial);
1581 			(void)dns_diff_print(&diff, NULL);
1582 			CHECK(dns_diff_apply(&diff, db, ver));
1583 			dns_diff_clear(&diff);
1584 			n_put = 0;
1585 		}
1586 	}
1587 	if (result == ISC_R_NOMORE) {
1588 		result = ISC_R_SUCCESS;
1589 	}
1590 	CHECK(result);
1591 
1592 	if (n_put != 0) {
1593 		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1594 			      "%s: applying final diff to database (%u)",
1595 			      j->filename, db_serial);
1596 		(void)dns_diff_print(&diff, NULL);
1597 		CHECK(dns_diff_apply(&diff, db, ver));
1598 		dns_diff_clear(&diff);
1599 	}
1600 
1601 failure:
1602 	if (ver != NULL) {
1603 		dns_db_closeversion(db, &ver,
1604 				    result == ISC_R_SUCCESS ? true : false);
1605 	}
1606 
1607 	if (source.base != NULL) {
1608 		isc_mem_put(j->mctx, source.base, source.length);
1609 	}
1610 	if (target.base != NULL) {
1611 		isc_mem_put(j->mctx, target.base, target.length);
1612 	}
1613 
1614 	dns_diff_clear(&diff);
1615 
1616 	INSIST(ver == NULL);
1617 
1618 	return (result);
1619 }
1620 
1621 isc_result_t
dns_journal_print(isc_mem_t * mctx,uint32_t flags,const char * filename,FILE * file)1622 dns_journal_print(isc_mem_t *mctx, uint32_t flags, const char *filename,
1623 		  FILE *file) {
1624 	dns_journal_t *j = NULL;
1625 	isc_buffer_t source;   /* Transaction data from disk */
1626 	isc_buffer_t target;   /* Ditto after _fromwire check */
1627 	uint32_t start_serial; /* Database SOA serial */
1628 	uint32_t end_serial;   /* Last journal SOA serial */
1629 	isc_result_t result;
1630 	dns_diff_t diff;
1631 	unsigned int n_soa = 0;
1632 	unsigned int n_put = 0;
1633 	bool printxhdr = ((flags & DNS_JOURNAL_PRINTXHDR) != 0);
1634 
1635 	REQUIRE(filename != NULL);
1636 
1637 	result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
1638 	if (result == ISC_R_NOTFOUND) {
1639 		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1640 		return (DNS_R_NOJOURNAL);
1641 	} else if (result != ISC_R_SUCCESS) {
1642 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1643 			      "journal open failure: %s: %s",
1644 			      isc_result_totext(result), filename);
1645 		return (result);
1646 	}
1647 
1648 	if (printxhdr) {
1649 		fprintf(file, "Journal format = %sHeader version = %d\n",
1650 			j->header.format + 1, j->header_ver1 ? 1 : 2);
1651 		fprintf(file, "Start serial = %u\n", j->header.begin.serial);
1652 		fprintf(file, "End serial = %u\n", j->header.end.serial);
1653 		fprintf(file, "Index (size = %u):\n", j->header.index_size);
1654 		for (uint32_t i = 0; i < j->header.index_size; i++) {
1655 			if (j->index[i].offset == 0) {
1656 				fputc('\n', file);
1657 				break;
1658 			}
1659 			fprintf(file, "%lld", (long long)j->index[i].offset);
1660 			fputc((i + 1) % 8 == 0 ? '\n' : ' ', file);
1661 		}
1662 	}
1663 	if (j->header.serialset) {
1664 		fprintf(file, "Source serial = %u\n", j->header.sourceserial);
1665 	}
1666 	dns_diff_init(j->mctx, &diff);
1667 
1668 	/*
1669 	 * Set up empty initial buffers for unchecked and checked
1670 	 * wire format transaction data.  They will be reallocated
1671 	 * later.
1672 	 */
1673 	isc_buffer_init(&source, NULL, 0);
1674 	isc_buffer_init(&target, NULL, 0);
1675 
1676 	start_serial = dns_journal_first_serial(j);
1677 	end_serial = dns_journal_last_serial(j);
1678 
1679 	CHECK(dns_journal_iter_init(j, start_serial, end_serial, NULL));
1680 
1681 	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
1682 	     result = dns_journal_next_rr(j))
1683 	{
1684 		dns_name_t *name = NULL;
1685 		dns_rdata_t *rdata = NULL;
1686 		dns_difftuple_t *tuple = NULL;
1687 		static uint32_t i = 0;
1688 		bool print = false;
1689 		uint32_t ttl;
1690 
1691 		dns_journal_current_rr(j, &name, &ttl, &rdata);
1692 
1693 		if (rdata->type == dns_rdatatype_soa) {
1694 			n_soa++;
1695 			if (n_soa == 3) {
1696 				n_soa = 1;
1697 			}
1698 			if (n_soa == 1) {
1699 				print = printxhdr;
1700 			}
1701 		}
1702 		if (n_soa == 0) {
1703 			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1704 				      "%s: journal file corrupt: missing "
1705 				      "initial SOA",
1706 				      j->filename);
1707 			FAIL(ISC_R_UNEXPECTED);
1708 		}
1709 
1710 		if (print) {
1711 			fprintf(file,
1712 				"Transaction: version %d offset %lld size %u "
1713 				"rrcount %u start %u end %u\n",
1714 				j->xhdr_version, (long long)j->it.cpos.offset,
1715 				j->curxhdr.size, j->curxhdr.count,
1716 				j->curxhdr.serial0, j->curxhdr.serial1);
1717 			if (j->it.cpos.offset > j->index[i].offset) {
1718 				fprintf(file,
1719 					"ERROR: Offset mismatch, "
1720 					"expected %lld\n",
1721 					(long long)j->index[i].offset);
1722 			} else if (j->it.cpos.offset == j->index[i].offset) {
1723 				i++;
1724 			}
1725 		}
1726 		CHECK(dns_difftuple_create(
1727 			diff.mctx, n_soa == 1 ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1728 			name, ttl, rdata, &tuple));
1729 		dns_diff_append(&diff, &tuple);
1730 
1731 		if (++n_put > 100 || printxhdr) {
1732 			result = dns_diff_print(&diff, file);
1733 			dns_diff_clear(&diff);
1734 			n_put = 0;
1735 			if (result != ISC_R_SUCCESS) {
1736 				break;
1737 			}
1738 		}
1739 	}
1740 	if (result == ISC_R_NOMORE) {
1741 		result = ISC_R_SUCCESS;
1742 	}
1743 	CHECK(result);
1744 
1745 	if (n_put != 0) {
1746 		result = dns_diff_print(&diff, file);
1747 		dns_diff_clear(&diff);
1748 	}
1749 	goto cleanup;
1750 
1751 failure:
1752 	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1753 		      "%s: cannot print: journal file corrupt", j->filename);
1754 
1755 cleanup:
1756 	if (source.base != NULL) {
1757 		isc_mem_put(j->mctx, source.base, source.length);
1758 	}
1759 	if (target.base != NULL) {
1760 		isc_mem_put(j->mctx, target.base, target.length);
1761 	}
1762 
1763 	dns_diff_clear(&diff);
1764 	dns_journal_destroy(&j);
1765 
1766 	return (result);
1767 }
1768 
1769 /**************************************************************************/
1770 /*
1771  * Miscellaneous accessors.
1772  */
1773 bool
dns_journal_empty(dns_journal_t * j)1774 dns_journal_empty(dns_journal_t *j) {
1775 	return (JOURNAL_EMPTY(&j->header));
1776 }
1777 
1778 bool
dns_journal_recovered(dns_journal_t * j)1779 dns_journal_recovered(dns_journal_t *j) {
1780 	return (j->recovered);
1781 }
1782 
1783 uint32_t
dns_journal_first_serial(dns_journal_t * j)1784 dns_journal_first_serial(dns_journal_t *j) {
1785 	return (j->header.begin.serial);
1786 }
1787 
1788 uint32_t
dns_journal_last_serial(dns_journal_t * j)1789 dns_journal_last_serial(dns_journal_t *j) {
1790 	return (j->header.end.serial);
1791 }
1792 
1793 void
dns_journal_set_sourceserial(dns_journal_t * j,uint32_t sourceserial)1794 dns_journal_set_sourceserial(dns_journal_t *j, uint32_t sourceserial) {
1795 	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
1796 		j->state == JOURNAL_STATE_INLINE ||
1797 		j->state == JOURNAL_STATE_TRANSACTION);
1798 
1799 	j->header.sourceserial = sourceserial;
1800 	j->header.serialset = true;
1801 	if (j->state == JOURNAL_STATE_WRITE) {
1802 		j->state = JOURNAL_STATE_INLINE;
1803 	}
1804 }
1805 
1806 bool
dns_journal_get_sourceserial(dns_journal_t * j,uint32_t * sourceserial)1807 dns_journal_get_sourceserial(dns_journal_t *j, uint32_t *sourceserial) {
1808 	REQUIRE(sourceserial != NULL);
1809 
1810 	if (!j->header.serialset) {
1811 		return (false);
1812 	}
1813 	*sourceserial = j->header.sourceserial;
1814 	return (true);
1815 }
1816 
1817 /**************************************************************************/
1818 /*
1819  * Iteration support.
1820  *
 * When serving an outgoing IXFR, we transmit a part of the journal starting
1822  * at the serial number in the IXFR request and ending at the serial
1823  * number that is current when the IXFR request arrives.  The ending
1824  * serial number is not necessarily at the end of the journal:
1825  * the journal may grow while the IXFR is in progress, but we stop
1826  * when we reach the serial number that was current when the IXFR started.
1827  */
1828 
1829 static isc_result_t
1830 read_one_rr(dns_journal_t *j);
1831 
1832 /*
1833  * Make sure the buffer 'b' is has at least 'size' bytes
1834  * allocated, and clear it.
1835  *
1836  * Requires:
1837  *	Either b->base is NULL, or it points to b->length bytes of memory
1838  *	previously allocated by isc_mem_get().
1839  */
1840 
1841 static isc_result_t
size_buffer(isc_mem_t * mctx,isc_buffer_t * b,unsigned size)1842 size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1843 	if (b->length < size) {
1844 		void *mem = isc_mem_get(mctx, size);
1845 		if (mem == NULL) {
1846 			return (ISC_R_NOMEMORY);
1847 		}
1848 		if (b->base != NULL) {
1849 			isc_mem_put(mctx, b->base, b->length);
1850 		}
1851 		b->base = mem;
1852 		b->length = size;
1853 	}
1854 	isc_buffer_clear(b);
1855 	return (ISC_R_SUCCESS);
1856 }
1857 
1858 isc_result_t
dns_journal_iter_init(dns_journal_t * j,uint32_t begin_serial,uint32_t end_serial,size_t * xfrsizep)1859 dns_journal_iter_init(dns_journal_t *j, uint32_t begin_serial,
1860 		      uint32_t end_serial, size_t *xfrsizep) {
1861 	isc_result_t result;
1862 
1863 	CHECK(journal_find(j, begin_serial, &j->it.bpos));
1864 	INSIST(j->it.bpos.serial == begin_serial);
1865 
1866 	CHECK(journal_find(j, end_serial, &j->it.epos));
1867 	INSIST(j->it.epos.serial == end_serial);
1868 
1869 	if (xfrsizep != NULL) {
1870 		journal_pos_t pos = j->it.bpos;
1871 		journal_xhdr_t xhdr;
1872 		uint64_t size = 0;
1873 		uint32_t count = 0;
1874 
1875 		/*
1876 		 * We already know the beginning and ending serial
1877 		 * numbers are in the journal. Scan through them,
1878 		 * adding up sizes and RR counts so we can calculate
1879 		 * the IXFR size.
1880 		 */
1881 		do {
1882 			CHECK(journal_seek(j, pos.offset));
1883 			CHECK(journal_read_xhdr(j, &xhdr));
1884 
1885 			if (j->header_ver1) {
1886 				CHECK(maybe_fixup_xhdr(j, &xhdr, pos.serial,
1887 						       pos.offset));
1888 			}
1889 
1890 			/*
1891 			 * Check that xhdr is consistent.
1892 			 */
1893 			if (xhdr.serial0 != pos.serial ||
1894 			    isc_serial_le(xhdr.serial1, xhdr.serial0)) {
1895 				CHECK(ISC_R_UNEXPECTED);
1896 			}
1897 
1898 			size += xhdr.size;
1899 			count += xhdr.count;
1900 
1901 			result = journal_next(j, &pos);
1902 			if (result == ISC_R_NOMORE) {
1903 				result = ISC_R_SUCCESS;
1904 			}
1905 			CHECK(result);
1906 		} while (pos.serial != end_serial);
1907 
1908 		/*
1909 		 * For each RR, subtract the length of the RR header,
1910 		 * as this would not be present in IXFR messages.
1911 		 * (We don't need to worry about the transaction header
1912 		 * because that was already excluded from xdr.size.)
1913 		 */
1914 		*xfrsizep = size - (count * sizeof(journal_rawrrhdr_t));
1915 	}
1916 
1917 	result = ISC_R_SUCCESS;
1918 failure:
1919 	j->it.result = result;
1920 	return (j->it.result);
1921 }
1922 
1923 isc_result_t
dns_journal_first_rr(dns_journal_t * j)1924 dns_journal_first_rr(dns_journal_t *j) {
1925 	isc_result_t result;
1926 
1927 	/*
1928 	 * Seek to the beginning of the first transaction we are
1929 	 * interested in.
1930 	 */
1931 	CHECK(journal_seek(j, j->it.bpos.offset));
1932 	j->it.current_serial = j->it.bpos.serial;
1933 
1934 	j->it.xsize = 0; /* We have no transaction data yet... */
1935 	j->it.xpos = 0;	 /* ...and haven't used any of it. */
1936 
1937 	return (read_one_rr(j));
1938 
1939 failure:
1940 	return (result);
1941 }
1942 
1943 static isc_result_t
read_one_rr(dns_journal_t * j)1944 read_one_rr(dns_journal_t *j) {
1945 	isc_result_t result;
1946 	dns_rdatatype_t rdtype;
1947 	dns_rdataclass_t rdclass;
1948 	unsigned int rdlen;
1949 	uint32_t ttl;
1950 	journal_xhdr_t xhdr;
1951 	journal_rrhdr_t rrhdr;
1952 	dns_journal_t save = *j;
1953 
1954 	if (j->offset > j->it.epos.offset) {
1955 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1956 			      "%s: journal corrupt: possible integer overflow",
1957 			      j->filename);
1958 		return (ISC_R_UNEXPECTED);
1959 	}
1960 	if (j->offset == j->it.epos.offset) {
1961 		return (ISC_R_NOMORE);
1962 	}
1963 	if (j->it.xpos == j->it.xsize) {
1964 		/*
1965 		 * We are at a transaction boundary.
1966 		 * Read another transaction header.
1967 		 */
1968 		CHECK(journal_read_xhdr(j, &xhdr));
1969 		if (xhdr.size == 0) {
1970 			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1971 				      "%s: journal corrupt: empty transaction",
1972 				      j->filename);
1973 			FAIL(ISC_R_UNEXPECTED);
1974 		}
1975 
1976 		if (j->header_ver1) {
1977 			CHECK(maybe_fixup_xhdr(j, &xhdr, j->it.current_serial,
1978 					       save.offset));
1979 		}
1980 
1981 		if (xhdr.serial0 != j->it.current_serial ||
1982 		    isc_serial_le(xhdr.serial1, xhdr.serial0))
1983 		{
1984 			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1985 				      "%s: journal file corrupt: "
1986 				      "expected serial %u, got %u",
1987 				      j->filename, j->it.current_serial,
1988 				      xhdr.serial0);
1989 			FAIL(ISC_R_UNEXPECTED);
1990 		}
1991 
1992 		j->it.xsize = xhdr.size;
1993 		j->it.xpos = 0;
1994 	}
1995 	/*
1996 	 * Read an RR.
1997 	 */
1998 	CHECK(journal_read_rrhdr(j, &rrhdr));
1999 	/*
2000 	 * Perform a sanity check on the journal RR size.
2001 	 * The smallest possible RR has a 1-byte owner name
2002 	 * and a 10-byte header.  The largest possible
2003 	 * RR has 65535 bytes of data, a header, and a maximum-
2004 	 * size owner name, well below 70 k total.
2005 	 */
2006 	if (rrhdr.size < 1 + 10 || rrhdr.size > 70000) {
2007 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
2008 			      "%s: journal corrupt: impossible RR size "
2009 			      "(%d bytes)",
2010 			      j->filename, rrhdr.size);
2011 		FAIL(ISC_R_UNEXPECTED);
2012 	}
2013 
2014 	CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
2015 	CHECK(journal_read(j, j->it.source.base, rrhdr.size));
2016 	isc_buffer_add(&j->it.source, rrhdr.size);
2017 
2018 	/*
2019 	 * The target buffer is made the same size
2020 	 * as the source buffer, with the assumption that when
2021 	 * no compression in present, the output of dns_*_fromwire()
2022 	 * is no larger than the input.
2023 	 */
2024 	CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
2025 
2026 	/*
2027 	 * Parse the owner name.  We don't know where it
2028 	 * ends yet, so we make the entire "remaining"
2029 	 * part of the buffer "active".
2030 	 */
2031 	isc_buffer_setactive(&j->it.source,
2032 			     j->it.source.used - j->it.source.current);
2033 	CHECK(dns_name_fromwire(&j->it.name, &j->it.source, &j->it.dctx, 0,
2034 				&j->it.target));
2035 
2036 	/*
2037 	 * Check that the RR header is there, and parse it.
2038 	 */
2039 	if (isc_buffer_remaininglength(&j->it.source) < 10) {
2040 		FAIL(DNS_R_FORMERR);
2041 	}
2042 
2043 	rdtype = isc_buffer_getuint16(&j->it.source);
2044 	rdclass = isc_buffer_getuint16(&j->it.source);
2045 	ttl = isc_buffer_getuint32(&j->it.source);
2046 	rdlen = isc_buffer_getuint16(&j->it.source);
2047 
2048 	if (rdlen > DNS_RDATA_MAXLENGTH) {
2049 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
2050 			      "%s: journal corrupt: impossible rdlen "
2051 			      "(%u bytes)",
2052 			      j->filename, rdlen);
2053 		FAIL(ISC_R_FAILURE);
2054 	}
2055 
2056 	/*
2057 	 * Parse the rdata.
2058 	 */
2059 	if (isc_buffer_remaininglength(&j->it.source) != rdlen) {
2060 		FAIL(DNS_R_FORMERR);
2061 	}
2062 	isc_buffer_setactive(&j->it.source, rdlen);
2063 	dns_rdata_reset(&j->it.rdata);
2064 	CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass, rdtype, &j->it.source,
2065 				 &j->it.dctx, 0, &j->it.target));
2066 	j->it.ttl = ttl;
2067 
2068 	j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
2069 	if (rdtype == dns_rdatatype_soa) {
2070 		/* XXX could do additional consistency checks here */
2071 		j->it.current_serial = dns_soa_getserial(&j->it.rdata);
2072 	}
2073 
2074 	result = ISC_R_SUCCESS;
2075 
2076 failure:
2077 	j->it.result = result;
2078 	return (result);
2079 }
2080 
2081 isc_result_t
dns_journal_next_rr(dns_journal_t * j)2082 dns_journal_next_rr(dns_journal_t *j) {
2083 	j->it.result = read_one_rr(j);
2084 	return (j->it.result);
2085 }
2086 
2087 void
dns_journal_current_rr(dns_journal_t * j,dns_name_t ** name,uint32_t * ttl,dns_rdata_t ** rdata)2088 dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, uint32_t *ttl,
2089 		       dns_rdata_t **rdata) {
2090 	REQUIRE(j->it.result == ISC_R_SUCCESS);
2091 	*name = &j->it.name;
2092 	*ttl = j->it.ttl;
2093 	*rdata = &j->it.rdata;
2094 }
2095 
2096 /**************************************************************************/
2097 /*
2098  * Generating diffs from databases
2099  */
2100 
2101 /*
2102  * Construct a diff containing all the RRs at the current name of the
2103  * database iterator 'dbit' in database 'db', version 'ver'.
2104  * Set '*name' to the current name, and append the diff to 'diff'.
2105  * All new tuples will have the operation 'op'.
2106  *
2107  * Requires: 'name' must have buffer large enough to hold the name.
2108  * Typically, a dns_fixedname_t would be used.
2109  */
2110 static isc_result_t
get_name_diff(dns_db_t * db,dns_dbversion_t * ver,isc_stdtime_t now,dns_dbiterator_t * dbit,dns_name_t * name,dns_diffop_t op,dns_diff_t * diff)2111 get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
2112 	      dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
2113 	      dns_diff_t *diff) {
2114 	isc_result_t result;
2115 	dns_dbnode_t *node = NULL;
2116 	dns_rdatasetiter_t *rdsiter = NULL;
2117 	dns_difftuple_t *tuple = NULL;
2118 
2119 	result = dns_dbiterator_current(dbit, &node, name);
2120 	if (result != ISC_R_SUCCESS) {
2121 		return (result);
2122 	}
2123 
2124 	result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
2125 	if (result != ISC_R_SUCCESS) {
2126 		goto cleanup_node;
2127 	}
2128 
2129 	for (result = dns_rdatasetiter_first(rdsiter); result == ISC_R_SUCCESS;
2130 	     result = dns_rdatasetiter_next(rdsiter))
2131 	{
2132 		dns_rdataset_t rdataset;
2133 
2134 		dns_rdataset_init(&rdataset);
2135 		dns_rdatasetiter_current(rdsiter, &rdataset);
2136 
2137 		for (result = dns_rdataset_first(&rdataset);
2138 		     result == ISC_R_SUCCESS;
2139 		     result = dns_rdataset_next(&rdataset))
2140 		{
2141 			dns_rdata_t rdata = DNS_RDATA_INIT;
2142 			dns_rdataset_current(&rdataset, &rdata);
2143 			result = dns_difftuple_create(diff->mctx, op, name,
2144 						      rdataset.ttl, &rdata,
2145 						      &tuple);
2146 			if (result != ISC_R_SUCCESS) {
2147 				dns_rdataset_disassociate(&rdataset);
2148 				goto cleanup_iterator;
2149 			}
2150 			dns_diff_append(diff, &tuple);
2151 		}
2152 		dns_rdataset_disassociate(&rdataset);
2153 		if (result != ISC_R_NOMORE) {
2154 			goto cleanup_iterator;
2155 		}
2156 	}
2157 	if (result != ISC_R_NOMORE) {
2158 		goto cleanup_iterator;
2159 	}
2160 
2161 	result = ISC_R_SUCCESS;
2162 
2163 cleanup_iterator:
2164 	dns_rdatasetiter_destroy(&rdsiter);
2165 
2166 cleanup_node:
2167 	dns_db_detachnode(db, &node);
2168 
2169 	return (result);
2170 }
2171 
2172 /*
2173  * Comparison function for use by dns_diff_subtract when sorting
2174  * the diffs to be subtracted.  The sort keys are the rdata type
2175  * and the rdata itself.  The owner name is ignored, because
2176  * it is known to be the same for all tuples.
2177  */
2178 static int
rdata_order(const void * av,const void * bv)2179 rdata_order(const void *av, const void *bv) {
2180 	dns_difftuple_t const *const *ap = av;
2181 	dns_difftuple_t const *const *bp = bv;
2182 	dns_difftuple_t const *a = *ap;
2183 	dns_difftuple_t const *b = *bp;
2184 	int r;
2185 	r = (b->rdata.type - a->rdata.type);
2186 	if (r != 0) {
2187 		return (r);
2188 	}
2189 	r = dns_rdata_compare(&a->rdata, &b->rdata);
2190 	return (r);
2191 }
2192 
2193 static isc_result_t
dns_diff_subtract(dns_diff_t diff[2],dns_diff_t * r)2194 dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
2195 	isc_result_t result;
2196 	dns_difftuple_t *p[2];
2197 	int i, t;
2198 	bool append;
2199 	dns_difftuplelist_t add, del;
2200 
2201 	CHECK(dns_diff_sort(&diff[0], rdata_order));
2202 	CHECK(dns_diff_sort(&diff[1], rdata_order));
2203 	ISC_LIST_INIT(add);
2204 	ISC_LIST_INIT(del);
2205 
2206 	for (;;) {
2207 		p[0] = ISC_LIST_HEAD(diff[0].tuples);
2208 		p[1] = ISC_LIST_HEAD(diff[1].tuples);
2209 		if (p[0] == NULL && p[1] == NULL) {
2210 			break;
2211 		}
2212 
2213 		for (i = 0; i < 2; i++) {
2214 			if (p[!i] == NULL) {
2215 				dns_difftuplelist_t *l = (i == 0) ? &add : &del;
2216 				ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
2217 				ISC_LIST_APPEND(*l, p[i], link);
2218 				goto next;
2219 			}
2220 		}
2221 		t = rdata_order(&p[0], &p[1]);
2222 		if (t < 0) {
2223 			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
2224 			ISC_LIST_APPEND(add, p[0], link);
2225 			goto next;
2226 		}
2227 		if (t > 0) {
2228 			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
2229 			ISC_LIST_APPEND(del, p[1], link);
2230 			goto next;
2231 		}
2232 		INSIST(t == 0);
2233 		/*
2234 		 * Identical RRs in both databases; skip them both
2235 		 * if the ttl differs.
2236 		 */
2237 		append = (p[0]->ttl != p[1]->ttl);
2238 		for (i = 0; i < 2; i++) {
2239 			ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
2240 			if (append) {
2241 				dns_difftuplelist_t *l = (i == 0) ? &add : &del;
2242 				ISC_LIST_APPEND(*l, p[i], link);
2243 			} else {
2244 				dns_difftuple_free(&p[i]);
2245 			}
2246 		}
2247 	next:;
2248 	}
2249 	ISC_LIST_APPENDLIST(r->tuples, del, link);
2250 	ISC_LIST_APPENDLIST(r->tuples, add, link);
2251 	result = ISC_R_SUCCESS;
2252 failure:
2253 	return (result);
2254 }
2255 
2256 static isc_result_t
diff_namespace(dns_db_t * dba,dns_dbversion_t * dbvera,dns_db_t * dbb,dns_dbversion_t * dbverb,unsigned int options,dns_diff_t * resultdiff)2257 diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera, dns_db_t *dbb,
2258 	       dns_dbversion_t *dbverb, unsigned int options,
2259 	       dns_diff_t *resultdiff) {
2260 	dns_db_t *db[2];
2261 	dns_dbversion_t *ver[2];
2262 	dns_dbiterator_t *dbit[2] = { NULL, NULL };
2263 	bool have[2] = { false, false };
2264 	dns_fixedname_t fixname[2];
2265 	isc_result_t result, itresult[2];
2266 	dns_diff_t diff[2];
2267 	int i, t;
2268 
2269 	db[0] = dba, db[1] = dbb;
2270 	ver[0] = dbvera, ver[1] = dbverb;
2271 
2272 	dns_diff_init(resultdiff->mctx, &diff[0]);
2273 	dns_diff_init(resultdiff->mctx, &diff[1]);
2274 
2275 	dns_fixedname_init(&fixname[0]);
2276 	dns_fixedname_init(&fixname[1]);
2277 
2278 	result = dns_db_createiterator(db[0], options, &dbit[0]);
2279 	if (result != ISC_R_SUCCESS) {
2280 		return (result);
2281 	}
2282 	result = dns_db_createiterator(db[1], options, &dbit[1]);
2283 	if (result != ISC_R_SUCCESS) {
2284 		goto cleanup_iterator;
2285 	}
2286 
2287 	itresult[0] = dns_dbiterator_first(dbit[0]);
2288 	itresult[1] = dns_dbiterator_first(dbit[1]);
2289 
2290 	for (;;) {
2291 		for (i = 0; i < 2; i++) {
2292 			if (!have[i] && itresult[i] == ISC_R_SUCCESS) {
2293 				CHECK(get_name_diff(
2294 					db[i], ver[i], 0, dbit[i],
2295 					dns_fixedname_name(&fixname[i]),
2296 					i == 0 ? DNS_DIFFOP_ADD
2297 					       : DNS_DIFFOP_DEL,
2298 					&diff[i]));
2299 				itresult[i] = dns_dbiterator_next(dbit[i]);
2300 				have[i] = true;
2301 			}
2302 		}
2303 
2304 		if (!have[0] && !have[1]) {
2305 			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2306 			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2307 			break;
2308 		}
2309 
2310 		for (i = 0; i < 2; i++) {
2311 			if (!have[!i]) {
2312 				ISC_LIST_APPENDLIST(resultdiff->tuples,
2313 						    diff[i].tuples, link);
2314 				INSIST(ISC_LIST_EMPTY(diff[i].tuples));
2315 				have[i] = false;
2316 				goto next;
2317 			}
2318 		}
2319 
2320 		t = dns_name_compare(dns_fixedname_name(&fixname[0]),
2321 				     dns_fixedname_name(&fixname[1]));
2322 		if (t < 0) {
2323 			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[0].tuples,
2324 					    link);
2325 			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2326 			have[0] = false;
2327 			continue;
2328 		}
2329 		if (t > 0) {
2330 			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[1].tuples,
2331 					    link);
2332 			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2333 			have[1] = false;
2334 			continue;
2335 		}
2336 		INSIST(t == 0);
2337 		CHECK(dns_diff_subtract(diff, resultdiff));
2338 		INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2339 		INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2340 		have[0] = have[1] = false;
2341 	next:;
2342 	}
2343 	if (itresult[0] != ISC_R_NOMORE) {
2344 		FAIL(itresult[0]);
2345 	}
2346 	if (itresult[1] != ISC_R_NOMORE) {
2347 		FAIL(itresult[1]);
2348 	}
2349 
2350 	INSIST(ISC_LIST_EMPTY(diff[0].tuples));
2351 	INSIST(ISC_LIST_EMPTY(diff[1].tuples));
2352 
2353 failure:
2354 	dns_dbiterator_destroy(&dbit[1]);
2355 
2356 cleanup_iterator:
2357 	dns_dbiterator_destroy(&dbit[0]);
2358 	dns_diff_clear(&diff[0]);
2359 	dns_diff_clear(&diff[1]);
2360 	return (result);
2361 }
2362 
2363 /*
2364  * Compare the databases 'dba' and 'dbb' and generate a journal
2365  * entry containing the changes to make 'dba' from 'dbb' (note
2366  * the order).  This journal entry will consist of a single,
2367  * possibly very large transaction.
2368  */
2369 isc_result_t
dns_db_diff(isc_mem_t * mctx,dns_db_t * dba,dns_dbversion_t * dbvera,dns_db_t * dbb,dns_dbversion_t * dbverb,const char * filename)2370 dns_db_diff(isc_mem_t *mctx, dns_db_t *dba, dns_dbversion_t *dbvera,
2371 	    dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
2372 	isc_result_t result;
2373 	dns_diff_t diff;
2374 
2375 	dns_diff_init(mctx, &diff);
2376 
2377 	result = dns_db_diffx(&diff, dba, dbvera, dbb, dbverb, filename);
2378 
2379 	dns_diff_clear(&diff);
2380 
2381 	return (result);
2382 }
2383 
2384 isc_result_t
dns_db_diffx(dns_diff_t * diff,dns_db_t * dba,dns_dbversion_t * dbvera,dns_db_t * dbb,dns_dbversion_t * dbverb,const char * filename)2385 dns_db_diffx(dns_diff_t *diff, dns_db_t *dba, dns_dbversion_t *dbvera,
2386 	     dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
2387 	isc_result_t result;
2388 	dns_journal_t *journal = NULL;
2389 
2390 	if (filename != NULL) {
2391 		result = dns_journal_open(diff->mctx, filename,
2392 					  DNS_JOURNAL_CREATE, &journal);
2393 		if (result != ISC_R_SUCCESS) {
2394 			return (result);
2395 		}
2396 	}
2397 
2398 	CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NONSEC3, diff));
2399 	CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NSEC3ONLY, diff));
2400 
2401 	if (journal != NULL) {
2402 		if (ISC_LIST_EMPTY(diff->tuples)) {
2403 			isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
2404 		} else {
2405 			CHECK(dns_journal_write_transaction(journal, diff));
2406 		}
2407 	}
2408 
2409 failure:
2410 	if (journal != NULL) {
2411 		dns_journal_destroy(&journal);
2412 	}
2413 	return (result);
2414 }
2415 
2416 static uint32_t
rrcount(unsigned char * buf,unsigned int size)2417 rrcount(unsigned char *buf, unsigned int size) {
2418 	isc_buffer_t b;
2419 	uint32_t rrsize, count = 0;
2420 
2421 	isc_buffer_init(&b, buf, size);
2422 	isc_buffer_add(&b, size);
2423 	while (isc_buffer_remaininglength(&b) > 0) {
2424 		rrsize = isc_buffer_getuint32(&b);
2425 		INSIST(isc_buffer_remaininglength(&b) >= rrsize);
2426 		isc_buffer_forward(&b, rrsize);
2427 		count++;
2428 	}
2429 
2430 	return (count);
2431 }
2432 
2433 static bool
check_delta(unsigned char * buf,size_t size)2434 check_delta(unsigned char *buf, size_t size) {
2435 	isc_buffer_t b;
2436 	uint32_t rrsize;
2437 
2438 	isc_buffer_init(&b, buf, size);
2439 	isc_buffer_add(&b, size);
2440 	while (isc_buffer_remaininglength(&b) > 0) {
2441 		if (isc_buffer_remaininglength(&b) < 4) {
2442 			return (false);
2443 		}
2444 		rrsize = isc_buffer_getuint32(&b);
2445 		/* "." + type + class + ttl + rdlen => 11U */
2446 		if (rrsize < 11U || isc_buffer_remaininglength(&b) < rrsize) {
2447 			return (false);
2448 		}
2449 		isc_buffer_forward(&b, rrsize);
2450 	}
2451 
2452 	return (true);
2453 }
2454 
/*
 * Compact the journal file 'filename'.
 *
 * A new journal is written to "<base>.jnw" and then renamed over
 * 'filename', where <base> is 'filename' with a trailing ".jnl" removed
 * (when the name is longer than four characters and ends in ".jnl").
 * If 'filename' cannot be found, the backup "<base>.jbk" is used as the
 * source instead.
 *
 * 'serial' is the serial the caller wants to keep from; transactions
 * are copied from a position chosen at or before it (see the search
 * below) up to the end of the journal.  'target_size' is the size being
 * aimed for; it is raised to DNS_JOURNAL_SIZE_MIN and adjusted for the
 * header and index overhead.
 *
 * 'flags':
 *	DNS_JOURNAL_COMPACTALL	rewrite the whole journal, starting at
 *				its first serial ('serial' is ignored).
 *	DNS_JOURNAL_VERSION1	together with DNS_JOURNAL_COMPACTALL,
 *				sets 'downgrade' for the new journal.
 *
 * A journal whose header is version 1 is always rewritten transaction
 * by transaction; otherwise, unless DNS_JOURNAL_COMPACTALL is given,
 * the data is copied in bulk without parsing it.
 *
 * Returns:
 *	ISC_R_SUCCESS	the journal was compacted, or there was nothing
 *			to do (empty journal, or already smaller than
 *			the target size).
 *	ISC_R_RANGE	'serial' lies outside the journal's serial range.
 *	ISC_R_FAILURE	a transaction is larger than the data remaining,
 *			or renaming the files failed.
 *	ISC_R_UNEXPECTED a transaction failed validation.
 *	Any error returned by the journal open/seek/read/write helpers.
 */
isc_result_t
dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
		    uint32_t flags, uint32_t target_size) {
	unsigned int i;
	journal_pos_t best_guess;
	journal_pos_t current_pos;
	dns_journal_t *j1 = NULL;
	dns_journal_t *j2 = NULL;
	journal_rawheader_t rawheader;
	unsigned int len;
	size_t namelen;
	unsigned char *buf = NULL;
	unsigned int size = 0;
	isc_result_t result;
	unsigned int indexend;
	char newname[PATH_MAX];
	char backup[PATH_MAX];
	bool is_backup = false;
	bool rewrite = false;
	bool downgrade = false;

	REQUIRE(filename != NULL);

	/* Strip a trailing ".jnl" to obtain the base name. */
	namelen = strlen(filename);
	if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0) {
		namelen -= 4;
	}

	/*
	 * 'result' temporarily holds the snprintf() return value here;
	 * the checks abort if either name would have been truncated.
	 */
	result = snprintf(newname, sizeof(newname), "%.*s.jnw", (int)namelen,
			  filename);
	RUNTIME_CHECK(result < sizeof(newname));

	result = snprintf(backup, sizeof(backup), "%.*s.jbk", (int)namelen,
			  filename);
	RUNTIME_CHECK(result < sizeof(backup));

	/* Open the journal, falling back to the backup if it is missing. */
	result = journal_open(mctx, filename, false, false, false, &j1);
	if (result == ISC_R_NOTFOUND) {
		is_backup = true;
		result = journal_open(mctx, backup, false, false, false, &j1);
	}
	if (result != ISC_R_SUCCESS) {
		return (result);
	}

	/*
	 * Always perform a re-write when processing a version 1 journal.
	 */
	rewrite = j1->header_ver1;

	/*
	 * Check whether we need to rewrite the whole journal
	 * file (for example, to upversion it).
	 */
	if ((flags & DNS_JOURNAL_COMPACTALL) != 0) {
		if ((flags & DNS_JOURNAL_VERSION1) != 0) {
			downgrade = true;
		}
		rewrite = true;
		serial = dns_journal_first_serial(j1);
	} else if (JOURNAL_EMPTY(&j1->header)) {
		dns_journal_destroy(&j1);
		return (ISC_R_SUCCESS);
	}

	/* 'serial' must lie within the range covered by the journal. */
	if (DNS_SERIAL_GT(j1->header.begin.serial, serial) ||
	    DNS_SERIAL_GT(serial, j1->header.end.serial))
	{
		dns_journal_destroy(&j1);
		return (ISC_R_RANGE);
	}

	/*
	 * Cope with very small target sizes.
	 */
	indexend = sizeof(journal_rawheader_t) +
		   j1->header.index_size * sizeof(journal_rawpos_t);
	if (target_size < DNS_JOURNAL_SIZE_MIN) {
		target_size = DNS_JOURNAL_SIZE_MIN;
	}
	if (target_size < indexend * 2) {
		target_size = target_size / 2 + indexend;
	}

	/*
	 * See if there is any work to do.
	 */
	if (!rewrite && (uint32_t)j1->header.end.offset < target_size) {
		dns_journal_destroy(&j1);
		return (ISC_R_SUCCESS);
	}

	/*
	 * Open the new journal and position it just past where its
	 * header and index will go.
	 */
	CHECK(journal_open(mctx, newname, true, true, downgrade, &j2));
	CHECK(journal_seek(j2, indexend));

	/*
	 * Remove overhead so space test below can succeed.
	 */
	if (target_size >= indexend) {
		target_size -= indexend;
	}

	/*
	 * Find if we can create enough free space.
	 *
	 * Among the valid index entries whose serial is not after
	 * 'serial', pick the one with the largest offset that still
	 * leaves at least target_size / 2 bytes up to the end of the
	 * journal.
	 */
	best_guess = j1->header.begin;
	for (i = 0; i < j1->header.index_size; i++) {
		if (POS_VALID(j1->index[i]) &&
		    DNS_SERIAL_GE(serial, j1->index[i].serial) &&
		    ((uint32_t)(j1->header.end.offset - j1->index[i].offset) >=
		     target_size / 2) &&
		    j1->index[i].offset > best_guess.offset)
		{
			best_guess = j1->index[i];
		}
	}

	/*
	 * Refine the guess by stepping through the transactions that
	 * follow it for as long as the same criteria keep holding.
	 */
	current_pos = best_guess;
	while (current_pos.serial != serial) {
		CHECK(journal_next(j1, &current_pos));
		if (current_pos.serial == j1->header.end.serial) {
			break;
		}

		if (DNS_SERIAL_GE(serial, current_pos.serial) &&
		    ((uint32_t)(j1->header.end.offset - current_pos.offset) >=
		     (target_size / 2)) &&
		    current_pos.offset > best_guess.offset)
		{
			best_guess = current_pos;
		} else {
			break;
		}
	}

	INSIST(best_guess.serial != j1->header.end.serial);
	if (best_guess.serial != serial) {
		CHECK(journal_next(j1, &best_guess));
		serial = best_guess.serial;
	}

	/*
	 * We should now be roughly half target_size provided
	 * we did not reach 'serial'.  If not we will just copy
	 * all uncommitted deltas regardless of the size.
	 */
	len = j1->header.end.offset - best_guess.offset;
	if (len != 0) {
		CHECK(journal_seek(j1, best_guess.offset));

		/* Prepare new header */
		j2->header.begin.serial = best_guess.serial;
		j2->header.begin.offset = indexend;
		j2->header.sourceserial = j1->header.sourceserial;
		j2->header.serialset = j1->header.serialset;
		j2->header.end.serial = j1->header.end.serial;

		/*
		 * Only use this method if we're rewriting the
		 * journal to fix outdated transaction headers;
		 * otherwise we'll copy the whole journal without
		 * parsing individual deltas below.
		 */
		while (rewrite && len > 0) {
			journal_xhdr_t xhdr;
			/* Start of this transaction, for re-reading it. */
			isc_offset_t offset = j1->offset;
			uint32_t count;

			result = journal_read_xhdr(j1, &xhdr);
			if (rewrite && result == ISC_R_NOMORE) {
				break;
			}
			CHECK(result);

			size = xhdr.size;
			if (size > len) {
				isc_log_write(JOURNAL_COMMON_LOGARGS,
					      ISC_LOG_ERROR,
					      "%s: journal file corrupt, "
					      "transaction too large",
					      j1->filename);
				CHECK(ISC_R_FAILURE);
			}
			buf = isc_mem_get(mctx, size);
			result = journal_read(j1, buf, size);

			/*
			 * If we're repairing an outdated journal, the
			 * xhdr format may be wrong.
			 *
			 * When the read fails or the data does not look
			 * like a sequence of RRs, switch to the other
			 * transaction header version and read the
			 * transaction again from 'offset'.
			 */
			if (rewrite && (result != ISC_R_SUCCESS ||
					!check_delta(buf, size))) {
				if (j1->xhdr_version == XHDR_VERSION2) {
					/* XHDR_VERSION2 -> XHDR_VERSION1 */
					j1->xhdr_version = XHDR_VERSION1;
					CHECK(journal_seek(j1, offset));
					CHECK(journal_read_xhdr(j1, &xhdr));
				} else if (j1->xhdr_version == XHDR_VERSION1) {
					/* XHDR_VERSION1 -> XHDR_VERSION2 */
					j1->xhdr_version = XHDR_VERSION2;
					CHECK(journal_seek(j1, offset));
					CHECK(journal_read_xhdr(j1, &xhdr));
				}

				/*
				 * Check again
				 *
				 * NOTE(review): the size check below can
				 * jump to 'failure', which releases 'buf'
				 * when it is not NULL; this presumably
				 * relies on isc_mem_put() resetting 'buf'
				 * to NULL -- confirm.
				 */
				isc_mem_put(mctx, buf, size);
				size = xhdr.size;
				if (size > len) {
					isc_log_write(
						JOURNAL_COMMON_LOGARGS,
						ISC_LOG_ERROR,
						"%s: journal file corrupt, "
						"transaction too large",
						j1->filename);
					CHECK(ISC_R_FAILURE);
				}
				buf = isc_mem_get(mctx, size);
				CHECK(journal_read(j1, buf, size));

				if (!check_delta(buf, size)) {
					CHECK(ISC_R_UNEXPECTED);
				}
			} else {
				CHECK(result);
			}

			/*
			 * Recover from incorrectly written transaction header.
			 * The incorrect header was written as size, serial0,
			 * serial1, and 0.  XHDR_VERSION2 is expecting size,
			 * count, serial0, and serial1.
			 */
			if (j1->xhdr_version == XHDR_VERSION2 &&
			    xhdr.count == serial && xhdr.serial1 == 0U &&
			    isc_serial_gt(xhdr.serial0, xhdr.count))
			{
				xhdr.serial1 = xhdr.serial0;
				xhdr.serial0 = xhdr.count;
				xhdr.count = 0;
			}

			/*
			 * Check that xhdr is consistent.
			 */
			if (xhdr.serial0 != serial ||
			    isc_serial_le(xhdr.serial1, xhdr.serial0)) {
				CHECK(ISC_R_UNEXPECTED);
			}

			/*
			 * Extract record count from the transaction.  This
			 * is needed when converting from XHDR_VERSION1 to
			 * XHDR_VERSION2, and when recovering from an
			 * incorrectly written XHDR_VERSION2.
			 */
			count = rrcount(buf, size);
			CHECK(journal_write_xhdr(j2, xhdr.size, count,
						 xhdr.serial0, xhdr.serial1));
			CHECK(journal_write(j2, buf, size));

			j2->header.end.offset = j2->offset;

			/* The next transaction must start at this serial. */
			serial = xhdr.serial1;

			len = j1->header.end.offset - j1->offset;
			isc_mem_put(mctx, buf, size);
		}

		/*
		 * If we're not rewriting transaction headers, we can use
		 * this faster method instead.
		 *
		 * The data is copied in chunks of at most 64 KiB.
		 */
		if (!rewrite) {
			size = ISC_MIN(64 * 1024, len);
			buf = isc_mem_get(mctx, size);
			for (i = 0; i < len; i += size) {
				unsigned int blob = ISC_MIN(size, len - i);
				CHECK(journal_read(j1, buf, blob));
				CHECK(journal_write(j2, buf, blob));
			}

			j2->header.end.offset = indexend + len;
		}

		CHECK(journal_fsync(j2));

		/*
		 * Update the journal header.
		 */
		journal_header_encode(&j2->header, &rawheader);
		CHECK(journal_seek(j2, 0));
		CHECK(journal_write(j2, &rawheader, sizeof(rawheader)));
		CHECK(journal_fsync(j2));

		/*
		 * Build new index.
		 */
		current_pos = j2->header.begin;
		while (current_pos.serial != j2->header.end.serial) {
			index_add(j2, &current_pos);
			CHECK(journal_next(j2, &current_pos));
		}

		/*
		 * Write index.
		 */
		CHECK(index_to_disk(j2));
		CHECK(journal_fsync(j2));

		indexend = j2->header.end.offset;
		POST(indexend);
	}

	/*
	 * Close both journals before trying to rename files (this is
	 * necessary on WIN32).
	 */
	dns_journal_destroy(&j1);
	dns_journal_destroy(&j2);

	/*
	 * With a UFS file system this should just succeed and be atomic.
	 * Any IXFR outs will just continue and the old journal will be
	 * removed on final close.
	 *
	 * With MSDOS / NTFS we need to do a two stage rename, triggered
	 * by EEXIST.  (If any IXFR's are running in other threads, however,
	 * this will fail, and the journal will not be compacted.  But
	 * if so, hopefully they'll be finished by the next time we
	 * compact.)
	 */
	if (rename(newname, filename) == -1) {
		if (errno == EEXIST && !is_backup) {
			result = isc_file_remove(backup);
			if (result != ISC_R_SUCCESS &&
			    result != ISC_R_FILENOTFOUND) {
				goto failure;
			}
			if (rename(filename, backup) == -1) {
				goto maperrno;
			}
			if (rename(newname, filename) == -1) {
				goto maperrno;
			}
			(void)isc_file_remove(backup);
		} else {
		maperrno:
			/* Every rename failure is reported as ISC_R_FAILURE. */
			result = ISC_R_FAILURE;
			goto failure;
		}
	}

	result = ISC_R_SUCCESS;

failure:
	/*
	 * Reached on success as well as on error: remove any leftover
	 * new-journal file and release whatever is still held.
	 */
	(void)isc_file_remove(newname);
	if (buf != NULL) {
		isc_mem_put(mctx, buf, size);
	}
	if (j1 != NULL) {
		dns_journal_destroy(&j1);
	}
	if (j2 != NULL) {
		dns_journal_destroy(&j2);
	}
	return (result);
}
2822 
2823 static isc_result_t
index_to_disk(dns_journal_t * j)2824 index_to_disk(dns_journal_t *j) {
2825 	isc_result_t result = ISC_R_SUCCESS;
2826 
2827 	if (j->header.index_size != 0) {
2828 		unsigned int i;
2829 		unsigned char *p;
2830 		unsigned int rawbytes;
2831 
2832 		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2833 
2834 		p = j->rawindex;
2835 		for (i = 0; i < j->header.index_size; i++) {
2836 			encode_uint32(j->index[i].serial, p);
2837 			p += 4;
2838 			encode_uint32(j->index[i].offset, p);
2839 			p += 4;
2840 		}
2841 		INSIST(p == j->rawindex + rawbytes);
2842 
2843 		CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2844 		CHECK(journal_write(j, j->rawindex, rawbytes));
2845 	}
2846 failure:
2847 	return (result);
2848 }
2849