xref: /openbsd/usr.bin/rsync/blocks.c (revision e5dd7070)
1 /*	$Id: blocks.c,v 1.19 2019/06/02 17:43:34 deraadt Exp $ */
2 /*
3  * Copyright (c) 2019 Kristaps Dzonsons <kristaps@bsd.lv>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 #include <sys/queue.h>
18 #include <sys/stat.h>
19 
20 #include <assert.h>
21 #include <endian.h>
22 #include <errno.h>
23 #include <inttypes.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 
29 #include <openssl/md4.h>
30 
31 #include "extern.h"
32 
33 struct	blkhash {
34 	const struct blk	*blk;
35 	TAILQ_ENTRY(blkhash)	 entries;
36 };
37 
38 TAILQ_HEAD(blkhashq, blkhash);
39 
40 /*
41  * Table used by the sender for looking up blocks.
42  * The blocks are those sent by the receiver; we're looking up using
43  * hashes computed from our local file.
44  */
45 struct	blktab {
46 	struct blkhashq	*q; /* entries in the hashtable */
47 	size_t		 qsz; /* size of the hashtable */
48 	struct blkhash	*blks; /* pre-allocated hashtable entries */
49 };
50 
51 /*
52  * This is the number of buckets in the hashtable.
53  * Use the same that GPL rsync uses.
54  * (It should be dynamic?)
55  */
56 #define	BLKTAB_SZ	  65536
57 
58 /*
59  * Initialise an empty hashtable with BLKTAB_SZ entries in it.
60  * Populate it for each file with blkhash_set.
61  * When we've processed all files, call blkhash_free.
62  * Returns NULL on allocation failure.
63  */
64 struct blktab *
65 blkhash_alloc(void)
66 {
67 	struct blktab	*p;
68 
69 	if ((p = calloc(1, sizeof(struct blktab))) == NULL) {
70 		ERR("calloc");
71 		return NULL;
72 	}
73 	p->qsz = BLKTAB_SZ;
74 	p->q = calloc(p->qsz, sizeof(struct blkhashq));
75 	if (p->q == NULL) {
76 		ERR("calloc");
77 		free(p);
78 		return NULL;
79 	}
80 	return p;
81 }
82 
83 /*
84  * Populate the hashtable with an incoming file's blocks.
85  * This will clear out any existing hashed data.
86  * Returns zero on allocation failure, non-zero otherwise.
87  */
88 int
89 blkhash_set(struct blktab *p, const struct blkset *bset)
90 {
91 	size_t	 i, idx;
92 
93 	if (bset == NULL)
94 		return 1;
95 
96 	/* Wipe clean the table. */
97 
98 	for (i = 0; i < p->qsz; i++)
99 		TAILQ_INIT(&p->q[i]);
100 
101 	/* Fill in the hashtable. */
102 
103 	p->blks = reallocarray(p->blks, bset->blksz, sizeof(struct blkhash));
104 	if (p->blks == NULL) {
105 		ERR("reallocarray");
106 		return 0;
107 	}
108 	for (i = 0; i < bset->blksz; i++) {
109 		p->blks[i].blk = &bset->blks[i];
110 		idx = bset->blks[i].chksum_short % p->qsz;
111 		assert(idx < p->qsz);
112 		TAILQ_INSERT_TAIL(&p->q[idx], &p->blks[i], entries);
113 	}
114 
115 	return 1;
116 }
117 
118 /*
119  * Free as allocated with blkhash_alloc().
120  */
121 void
122 blkhash_free(struct blktab *p)
123 {
124 
125 	free(p->blks);
126 	free(p);
127 }
128 
129 /*
130  * From our current position of "offs" in buffer "buf" of total size
131  * "size", see if we can find a matching block in our list of blocks.
132  * The "hint" refers to the block that *might* work.
133  * Returns the blk or NULL if no matching block was found.
134  */
135 static const struct blk *
136 blk_find(struct sess *sess, struct blkstat *st,
137 	const struct blkset *blks, const char *path, int recomp)
138 {
139 	unsigned char	 md[MD4_DIGEST_LENGTH];
140 	uint32_t	 fhash;
141 	off_t		 remain, osz;
142 	int		 have_md = 0;
143 	char		*map;
144 	const struct blkhashq *q;
145 	const struct blkhash *ent;
146 
147 	remain = st->mapsz - st->offs;
148 	assert(remain);
149 	osz = MINIMUM(remain, (off_t)blks->len);
150 
151 	/*
152 	 * First, compute our fast hash the hard way (if we're
153 	 * reentering this function from a previous block match, or the
154 	 * first time) or from our existing s1 and s2 values.
155 	 */
156 
157 	if (!recomp) {
158 		fhash = (st->s1 & 0xFFFF) | (st->s2 << 16);
159 	} else {
160 		fhash = hash_fast(st->map + st->offs, (size_t)osz);
161 		st->s1 = fhash & 0xFFFF;
162 		st->s2 = fhash >> 16;
163 	}
164 
165 	/*
166 	 * Start with our match hint.
167 	 * This just runs the fast and slow check with the hint.
168 	 */
169 
170 	if (st->hint < blks->blksz &&
171 	    fhash == blks->blks[st->hint].chksum_short &&
172 	    (size_t)osz == blks->blks[st->hint].len) {
173 		hash_slow(st->map + st->offs, (size_t)osz, md, sess);
174 		have_md = 1;
175 		if (memcmp(md, blks->blks[st->hint].chksum_long, blks->csum) == 0) {
176 			LOG4("%s: found matching hinted match: "
177 			    "position %jd, block %zu (position %jd, size %zu)",
178 			    path,
179 			    (intmax_t)st->offs, blks->blks[st->hint].idx,
180 			    (intmax_t)blks->blks[st->hint].offs,
181 			    blks->blks[st->hint].len);
182 			return &blks->blks[st->hint];
183 		}
184 	}
185 
186 	/*
187 	 * Look for the fast hash modulus in our hashtable, filter for
188 	 * those matching the full hash and length, then move to the
189 	 * slow hash.
190 	 * The slow hash is computed only once.
191 	 */
192 
193 	q = &st->blktab->q[fhash % st->blktab->qsz];
194 
195 	TAILQ_FOREACH(ent, q, entries) {
196 		if (fhash != ent->blk->chksum_short ||
197 		    (size_t)osz != ent->blk->len)
198 			continue;
199 
200 		LOG4("%s: found matching fast match: "
201 		    "position %jd, block %zu (position %jd, size %zu)",
202 		    path, (intmax_t)st->offs, ent->blk->idx,
203 		    (intmax_t)ent->blk->offs, ent->blk->len);
204 
205 		if (have_md == 0) {
206 			hash_slow(st->map + st->offs, (size_t)osz, md, sess);
207 			have_md = 1;
208 		}
209 
210 		if (memcmp(md, ent->blk->chksum_long, blks->csum))
211 			continue;
212 
213 		LOG4("%s: sender verifies slow match", path);
214 		return ent->blk;
215 	}
216 
217 	/*
218 	 * Adjust our partial sums for the hashing.
219 	 * We first remove the first byte from the sum.
220 	 * We then, if we have space, add the first byte of the next
221 	 * block in the sequence.
222 	 */
223 
224 	map = st->map + st->offs;
225 	st->s1 -= map[0];
226 	st->s2 -= osz * map[0];
227 
228 	if (osz < remain) {
229 		st->s1 += map[osz];
230 		st->s2 += st->s1;
231 	}
232 
233 	return NULL;
234 }
235 
236 /*
237  * Given a local file "path" and the blocks created by a remote machine,
238  * find out which blocks of our file they don't have and send them.
239  * This function is reentrant: it must be called while there's still
240  * data to send.
241  */
242 void
243 blk_match(struct sess *sess, const struct blkset *blks,
244 	const char *path, struct blkstat *st)
245 {
246 	off_t		  last, end, sz;
247 	int32_t		  tok;
248 	size_t		  i;
249 	const struct blk *blk;
250 
251 	/*
252 	 * If the file's empty or we don't have any blocks from the
253 	 * sender, then simply send the whole file.
254 	 * Otherwise, run the hash matching routine and send raw chunks
255 	 * and subsequent matching tokens.
256 	 */
257 
258 	if (st->mapsz && blks->blksz) {
259 		/*
260 		 * Stop searching at the length of the file minus the
261 		 * size of the last block.
262 		 * The reason for this being that we don't need to do an
263 		 * incremental hash within the last block---if it
264 		 * doesn't match, it doesn't match.
265 		 */
266 
267 		end = st->mapsz + 1 - blks->blks[blks->blksz - 1].len;
268 		last = st->offs;
269 
270 		for (i = 0; st->offs < end; st->offs++, i++) {
271 			blk = blk_find(sess, st, blks, path, i == 0);
272 			if (blk == NULL)
273 				continue;
274 
275 			sz = st->offs - last;
276 			st->dirty += sz;
277 			st->total += sz;
278 			LOG4("%s: flushing %jd B before %zu B block %zu",
279 			    path, (intmax_t)sz,
280 			    blk->len, blk->idx);
281 			tok = -(blk->idx + 1);
282 
283 			/*
284 			 * Write the data we have, then follow it with
285 			 * the tag of the block that matches.
286 			 */
287 
288 			st->curpos = last;
289 			st->curlen = st->curpos + sz;
290 			st->curtok = tok;
291 			assert(st->curtok != 0);
292 			st->curst = sz ? BLKSTAT_DATA : BLKSTAT_TOK;
293 			st->total += blk->len;
294 			st->offs += blk->len;
295 			st->hint = blk->idx + 1;
296 			return;
297 		}
298 
299 		/* Emit remaining data and send terminator token. */
300 
301 		sz = st->mapsz - last;
302 		LOG4("%s: flushing remaining %jd B",
303 		    path, (intmax_t)sz);
304 
305 		st->total += sz;
306 		st->dirty += sz;
307 		st->curpos = last;
308 		st->curlen = st->curpos + sz;
309 		st->curtok = 0;
310 		st->curst = sz ? BLKSTAT_DATA : BLKSTAT_TOK;
311 	} else {
312 		st->curpos = 0;
313 		st->curlen = st->mapsz;
314 		st->curtok = 0;
315 		st->curst = st->mapsz ? BLKSTAT_DATA : BLKSTAT_TOK;
316 		st->dirty = st->total = st->mapsz;
317 
318 		LOG4("%s: flushing whole file %zu B",
319 		    path, st->mapsz);
320 	}
321 }
322 
323 /*
324  * Buffer the message from sender to the receiver indicating that the
325  * block set has been received.
326  * Symmetrises blk_send_ack().
327  */
328 void
329 blk_recv_ack(char buf[20], const struct blkset *blocks, int32_t idx)
330 {
331 	size_t	 pos = 0, sz;
332 
333 	sz = sizeof(int32_t) + /* index */
334 	     sizeof(int32_t) + /* block count */
335 	     sizeof(int32_t) + /* block length */
336 	     sizeof(int32_t) + /* checksum length */
337 	     sizeof(int32_t); /* block remainder */
338 	assert(sz == 20);
339 
340 	io_buffer_int(buf, &pos, sz, idx);
341 	io_buffer_int(buf, &pos, sz, blocks->blksz);
342 	io_buffer_int(buf, &pos, sz, blocks->len);
343 	io_buffer_int(buf, &pos, sz, blocks->csum);
344 	io_buffer_int(buf, &pos, sz, blocks->rem);
345 	assert(pos == sz);
346 }
347 
348 /*
349  * Read all of the checksums for a file's blocks.
350  * Returns the set of blocks or NULL on failure.
351  */
352 struct blkset *
353 blk_recv(struct sess *sess, int fd, const char *path)
354 {
355 	struct blkset	*s;
356 	int32_t		 i;
357 	size_t		 j;
358 	struct blk	*b;
359 	off_t		 offs = 0;
360 
361 	if ((s = calloc(1, sizeof(struct blkset))) == NULL) {
362 		ERR("calloc");
363 		return NULL;
364 	}
365 
366 	/*
367 	 * The block prologue consists of a few values that we'll need
368 	 * in reading the individual blocks for this file.
369 	 * FIXME: read into buffer and unbuffer.
370 	 */
371 
372 	if (!io_read_size(sess, fd, &s->blksz)) {
373 		ERRX1("io_read_size");
374 		goto out;
375 	} else if (!io_read_size(sess, fd, &s->len)) {
376 		ERRX1("io_read_size");
377 		goto out;
378 	} else if (!io_read_size(sess, fd, &s->csum)) {
379 		ERRX1("io_read_int");
380 		goto out;
381 	} else if (!io_read_size(sess, fd, &s->rem)) {
382 		ERRX1("io_read_int");
383 		goto out;
384 	} else if (s->rem && s->rem >= s->len) {
385 		ERRX("block remainder is "
386 			"greater than block size");
387 		goto out;
388 	}
389 
390 	LOG3("%s: read block prologue: %zu blocks of "
391 	    "%zu B, %zu B remainder, %zu B checksum", path,
392 	    s->blksz, s->len, s->rem, s->csum);
393 
394 	if (s->blksz) {
395 		s->blks = calloc(s->blksz, sizeof(struct blk));
396 		if (s->blks == NULL) {
397 			ERR("calloc");
398 			goto out;
399 		}
400 	}
401 
402 	/*
403 	 * Read each block individually.
404 	 * FIXME: read buffer and unbuffer.
405 	 */
406 
407 	for (j = 0; j < s->blksz; j++) {
408 		b = &s->blks[j];
409 		if (!io_read_int(sess, fd, &i)) {
410 			ERRX1("io_read_int");
411 			goto out;
412 		}
413 		b->chksum_short = i;
414 
415 		assert(s->csum <= sizeof(b->chksum_long));
416 		if (!io_read_buf(sess,
417 		    fd, b->chksum_long, s->csum)) {
418 			ERRX1("io_read_buf");
419 			goto out;
420 		}
421 
422 		/*
423 		 * If we're the last block, then we're assigned the
424 		 * remainder of the data.
425 		 */
426 
427 		b->offs = offs;
428 		b->idx = j;
429 		b->len = (j == (s->blksz - 1) && s->rem) ?
430 			s->rem : s->len;
431 		offs += b->len;
432 
433 		LOG4("%s: read block %zu, length %zu B",
434 		    path, b->idx, b->len);
435 	}
436 
437 	s->size = offs;
438 	LOG3("%s: read blocks: %zu blocks, %jd B total blocked data",
439 	    path, s->blksz, (intmax_t)s->size);
440 	return s;
441 out:
442 	free(s->blks);
443 	free(s);
444 	return NULL;
445 }
446 
447 /*
448  * Symmetrise blk_recv_ack(), except w/o the leading identifier.
449  * Return zero on failure, non-zero on success.
450  */
451 int
452 blk_send_ack(struct sess *sess, int fd, struct blkset *p)
453 {
454 	char	 buf[16];
455 	size_t	 pos = 0, sz;
456 
457 	/* Put the entire send routine into a buffer. */
458 
459 	sz = sizeof(int32_t) + /* block count */
460 	     sizeof(int32_t) + /* block length */
461 	     sizeof(int32_t) + /* checksum length */
462 	     sizeof(int32_t); /* block remainder */
463 	assert(sz <= sizeof(buf));
464 
465 	if (!io_read_buf(sess, fd, buf, sz)) {
466 		ERRX1("io_read_buf");
467 		return 0;
468 	}
469 
470 	if (!io_unbuffer_size(buf, &pos, sz, &p->blksz))
471 		ERRX1("io_unbuffer_size");
472 	else if (!io_unbuffer_size(buf, &pos, sz, &p->len))
473 		ERRX1("io_unbuffer_size");
474 	else if (!io_unbuffer_size(buf, &pos, sz, &p->csum))
475 		ERRX1("io_unbuffer_size");
476 	else if (!io_unbuffer_size(buf, &pos, sz, &p->rem))
477 		ERRX1("io_unbuffer_size");
478 	else if (p->len && p->rem >= p->len)
479 		ERRX1("non-zero length is less than remainder");
480 	else if (p->csum == 0 || p->csum > 16)
481 		ERRX1("inappropriate checksum length");
482 	else
483 		return 1;
484 
485 	return 0;
486 }
487 
488 /*
489  * Transmit the metadata for set and blocks.
490  * Return zero on failure, non-zero on success.
491  */
492 int
493 blk_send(struct sess *sess, int fd, size_t idx,
494 	const struct blkset *p, const char *path)
495 {
496 	char	*buf;
497 	size_t	 i, pos = 0, sz;
498 	int	 rc = 0;
499 
500 	/* Put the entire send routine into a buffer. */
501 
502 	sz = sizeof(int32_t) + /* identifier */
503 	    sizeof(int32_t) + /* block count */
504 	    sizeof(int32_t) + /* block length */
505 	    sizeof(int32_t) + /* checksum length */
506 	    sizeof(int32_t) + /* block remainder */
507 	    p->blksz *
508 	    (sizeof(int32_t) + /* short checksum */
509 		p->csum); /* long checksum */
510 
511 	if ((buf = malloc(sz)) == NULL) {
512 		ERR("malloc");
513 		return 0;
514 	}
515 
516 	io_buffer_int(buf, &pos, sz, idx);
517 	io_buffer_int(buf, &pos, sz, p->blksz);
518 	io_buffer_int(buf, &pos, sz, p->len);
519 	io_buffer_int(buf, &pos, sz, p->csum);
520 	io_buffer_int(buf, &pos, sz, p->rem);
521 
522 	for (i = 0; i < p->blksz; i++) {
523 		io_buffer_int(buf, &pos, sz, p->blks[i].chksum_short);
524 		io_buffer_buf(buf, &pos, sz, p->blks[i].chksum_long, p->csum);
525 	}
526 
527 	assert(pos == sz);
528 
529 	if (!io_write_buf(sess, fd, buf, sz)) {
530 		ERRX1("io_write_buf");
531 		goto out;
532 	}
533 
534 	LOG3("%s: sent block prologue: %zu blocks of %zu B, "
535 	    "%zu B remainder, %zu B checksum",
536 	    path, p->blksz, p->len, p->rem, p->csum);
537 	rc = 1;
538 out:
539 	free(buf);
540 	return rc;
541 }
542