xref: /openbsd/usr.bin/rsync/blocks.c (revision 7eb67ff9)
1 /*	$OpenBSD: blocks.c,v 1.27 2024/09/27 13:13:14 tb Exp $ */
2 /*
3  * Copyright (c) 2019 Kristaps Dzonsons <kristaps@bsd.lv>
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16  */
17 #include <sys/queue.h>
18 #include <sys/stat.h>
19 
20 #include <assert.h>
21 #include <endian.h>
22 #include <errno.h>
23 #include <inttypes.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 
29 #include <openssl/md4.h>
30 
31 #include "extern.h"
32 
/*
 * One hashtable entry: indexes a single block from the receiver's
 * block set and chains into its bucket's tail queue.
 */
struct	blkhash {
	const struct blk	*blk;		/* block this entry indexes */
	TAILQ_ENTRY(blkhash)	 entries;	/* bucket linkage */
};

/* A hashtable bucket: entries whose short checksum maps to this slot. */
TAILQ_HEAD(blkhashq, blkhash);
39 
40 /*
41  * Table used by the sender for looking up blocks.
42  * The blocks are those sent by the receiver; we're looking up using
43  * hashes computed from our local file.
44  */
struct	blktab {
	struct blkhashq	*q; /* entries in the hashtable (BLKTAB_SZ buckets) */
	size_t		 qsz; /* size of the hashtable */
	struct blkhash	*blks; /* pre-allocated hashtable entries, reused per file */
};
50 
51 /*
52  * This is the number of buckets in the hashtable.
53  * Use the same that GPL rsync uses.
54  * (It should be dynamic?)
55  */
56 #define	BLKTAB_SZ	  65536
57 
58 /*
59  * Initialise an empty hashtable with BLKTAB_SZ entries in it.
60  * Populate it for each file with blkhash_set.
61  * When we've processed all files, call blkhash_free.
62  * Returns NULL on allocation failure.
63  */
64 struct blktab *
blkhash_alloc(void)65 blkhash_alloc(void)
66 {
67 	struct blktab	*p;
68 
69 	if ((p = calloc(1, sizeof(struct blktab))) == NULL) {
70 		ERR("calloc");
71 		return NULL;
72 	}
73 	p->qsz = BLKTAB_SZ;
74 	p->q = calloc(p->qsz, sizeof(struct blkhashq));
75 	if (p->q == NULL) {
76 		ERR("calloc");
77 		free(p);
78 		return NULL;
79 	}
80 	return p;
81 }
82 
83 /*
84  * Populate the hashtable with an incoming file's blocks.
85  * This will clear out any existing hashed data.
86  * Returns zero on allocation failure, non-zero otherwise.
87  */
88 int
blkhash_set(struct blktab * p,const struct blkset * bset)89 blkhash_set(struct blktab *p, const struct blkset *bset)
90 {
91 	struct blkhash	*blks;
92 	size_t		 i, idx;
93 
94 	if (bset == NULL || bset->blksz == 0)
95 		return 1;
96 
97 	/* Wipe clean the table. */
98 
99 	for (i = 0; i < p->qsz; i++)
100 		TAILQ_INIT(&p->q[i]);
101 
102 	/* Fill in the hashtable. */
103 
104 	blks = reallocarray(p->blks, bset->blksz, sizeof(struct blkhash));
105 	if (blks == NULL) {
106 		ERR("reallocarray");
107 		free(p->blks);
108 		p->blks = NULL;
109 		return 0;
110 	}
111 	p->blks = blks;
112 	for (i = 0; i < bset->blksz; i++) {
113 		p->blks[i].blk = &bset->blks[i];
114 		idx = bset->blks[i].chksum_short % p->qsz;
115 		assert(idx < p->qsz);
116 		TAILQ_INSERT_TAIL(&p->q[idx], &p->blks[i], entries);
117 	}
118 
119 	return 1;
120 }
121 
122 /*
123  * Free as allocated with blkhash_alloc().
124  */
125 void
blkhash_free(struct blktab * p)126 blkhash_free(struct blktab *p)
127 {
128 	if (p == NULL)
129 		return;
130 	free(p->q);
131 	free(p->blks);
132 	free(p);
133 }
134 
135 /*
136  * From our current position of "offs" in buffer "buf" of total size
137  * "size", see if we can find a matching block in our list of blocks.
138  * The "hint" refers to the block that *might* work.
139  * Returns the blk or NULL if no matching block was found.
140  */
/*
 * From our current position of "offs" in buffer "buf" of total size
 * "size", see if we can find a matching block in our list of blocks.
 * The "hint" refers to the block that *might* work.
 * Returns the blk or NULL if no matching block was found.
 */
static const struct blk *
blk_find(struct sess *sess, struct blkstat *st,
	const struct blkset *blks, const char *path, int recomp)
{
	unsigned char	 md[MD4_DIGEST_LENGTH];
	uint32_t	 fhash;
	off_t		 remain, osz;
	int		 have_md = 0;	/* slow hash computed at most once */
	char		*map;
	const struct blkhashq *q;
	const struct blkhash *ent;

	/* osz: how much of a full block we can still see from here. */
	remain = st->mapsz - st->offs;
	assert(remain);
	osz = MINIMUM(remain, (off_t)blks->len);

	/*
	 * First, compute our fast hash the hard way (if we're
	 * reentering this function from a previous block match, or the
	 * first time) or from our existing s1 and s2 values.
	 */

	if (!recomp) {
		/* s1/s2 were rolled forward at the end of the last call. */
		fhash = (st->s1 & 0xFFFF) | (st->s2 << 16);
	} else {
		fhash = hash_fast(st->map + st->offs, (size_t)osz);
		st->s1 = fhash & 0xFFFF;
		st->s2 = fhash >> 16;
	}

	/*
	 * Start with our match hint.
	 * This just runs the fast and slow check with the hint.
	 */

	if (st->hint < blks->blksz &&
	    fhash == blks->blks[st->hint].chksum_short &&
	    (size_t)osz == blks->blks[st->hint].len) {
		hash_slow(st->map + st->offs, (size_t)osz, md, sess);
		have_md = 1;
		if (memcmp(md, blks->blks[st->hint].chksum_long, blks->csum) == 0) {
			LOG4("%s: found matching hinted match: "
			    "position %jd, block %zu (position %jd, size %zu)",
			    path,
			    (intmax_t)st->offs, blks->blks[st->hint].idx,
			    (intmax_t)blks->blks[st->hint].offs,
			    blks->blks[st->hint].len);
			return &blks->blks[st->hint];
		}
	}

	/*
	 * Look for the fast hash modulus in our hashtable, filter for
	 * those matching the full hash and length, then move to the
	 * slow hash.
	 * The slow hash is computed only once.
	 */

	q = &st->blktab->q[fhash % st->blktab->qsz];

	TAILQ_FOREACH(ent, q, entries) {
		if (fhash != ent->blk->chksum_short ||
		    (size_t)osz != ent->blk->len)
			continue;

		LOG4("%s: found matching fast match: "
		    "position %jd, block %zu (position %jd, size %zu)",
		    path, (intmax_t)st->offs, ent->blk->idx,
		    (intmax_t)ent->blk->offs, ent->blk->len);

		if (have_md == 0) {
			hash_slow(st->map + st->offs, (size_t)osz, md, sess);
			have_md = 1;
		}

		if (memcmp(md, ent->blk->chksum_long, blks->csum))
			continue;

		LOG4("%s: sender verifies slow match", path);
		return ent->blk;
	}

	/*
	 * Adjust our partial sums for the hashing.
	 * We first remove the first byte from the sum.
	 * We then, if we have space, add the first byte of the next
	 * block in the sequence.
	 * This lets the next (non-recomp) call reconstruct the fast
	 * hash of the window shifted by one byte without rescanning.
	 */

	map = st->map + st->offs;
	st->s1 -= map[0];
	st->s2 -= osz * map[0];

	if (osz < remain) {
		st->s1 += map[osz];
		st->s2 += st->s1;
	}

	return NULL;
}
241 
242 /*
243  * Given a local file "path" and the blocks created by a remote machine,
244  * find out which blocks of our file they don't have and send them.
245  * This function is reentrant: it must be called while there's still
246  * data to send.
247  */
/*
 * Given a local file "path" and the blocks created by a remote machine,
 * find out which blocks of our file they don't have and send them.
 * This function is reentrant: it must be called while there's still
 * data to send.
 * On return, st->curst/curpos/curlen/curtok describe what the caller
 * should transmit next (raw data and/or a block token).
 */
void
blk_match(struct sess *sess, const struct blkset *blks,
	const char *path, struct blkstat *st)
{
	off_t		  last, end = 0, sz;
	int32_t		  tok;
	size_t		  i;
	const struct blk *blk;

	/*
	 * If the file's empty or we don't have any blocks from the
	 * sender, then simply send the whole file.
	 * Otherwise, run the hash matching routine and send raw chunks
	 * and subsequent matching tokens.
	 */

	if (st->mapsz && blks->blksz) {
		/*
		 * Stop searching at the length of the file minus the
		 * size of the last block.
		 * The reason for this being that we don't need to do an
		 * incremental hash within the last block---if it
		 * doesn't match, it doesn't match.
		 */

		end = st->mapsz + 1 - blks->blks[blks->blksz - 1].len;
	}

	/* "last" marks where the unmatched (raw) data began. */
	last = st->offs;
	for (i = 0; st->offs < end; st->offs++, i++) {
		/* i == 0: force a full fast-hash recompute on re-entry. */
		blk = blk_find(sess, st, blks, path, i == 0);
		if (blk == NULL)
			continue;

		sz = st->offs - last;
		st->dirty += sz;
		st->total += sz;
		LOG4("%s: flushing %jd B before %zu B block %zu",
		    path, (intmax_t)sz,
		    blk->len, blk->idx);
		/* Tokens are the negated one-based block index. */
		tok = -(blk->idx + 1);

		hash_file_buf(&st->ctx, st->map + last, sz + blk->len);

		/*
		 * Write the data we have, then follow it with
		 * the tag of the block that matches.
		 */

		st->curpos = last;
		st->curlen = st->curpos + sz;
		st->curtok = tok;
		assert(st->curtok != 0);
		st->curst = sz ? BLKSTAT_DATA : BLKSTAT_TOK;
		st->total += blk->len;
		st->offs += blk->len;
		/* Hint: the next match is likely the following block. */
		st->hint = blk->idx + 1;

		return;
	}

	/* Emit remaining data and send terminator token. */

	sz = st->mapsz - last;
	LOG4("%s: flushing %s %jd B", path,
	    last == 0 ? "whole" : "remaining", (intmax_t)sz);

	hash_file_buf(&st->ctx, st->map + last, sz);

	st->total += sz;
	st->dirty += sz;
	st->curpos = last;
	st->curlen = st->curpos + sz;
	st->curtok = 0;
	st->curst = sz ? BLKSTAT_DATA : BLKSTAT_TOK;
}
324 
325 /*
326  * Buffer the message from sender to the receiver indicating that the
327  * block set has been received.
328  * Symmetrises blk_send_ack().
329  */
330 void
blk_recv_ack(char buf[20],const struct blkset * blocks,int32_t idx)331 blk_recv_ack(char buf[20], const struct blkset *blocks, int32_t idx)
332 {
333 	size_t	 pos = 0, sz;
334 
335 	sz = sizeof(int32_t) + /* index */
336 	    sizeof(int32_t) + /* block count */
337 	    sizeof(int32_t) + /* block length */
338 	    sizeof(int32_t) + /* checksum length */
339 	    sizeof(int32_t); /* block remainder */
340 	assert(sz == 20);
341 
342 	io_buffer_int(buf, &pos, sz, idx);
343 	io_buffer_int(buf, &pos, sz, blocks->blksz);
344 	io_buffer_int(buf, &pos, sz, blocks->len);
345 	io_buffer_int(buf, &pos, sz, blocks->csum);
346 	io_buffer_int(buf, &pos, sz, blocks->rem);
347 	assert(pos == sz);
348 }
349 
350 /*
351  * Read all of the checksums for a file's blocks.
352  * Returns the set of blocks or NULL on failure.
353  */
354 struct blkset *
blk_recv(struct sess * sess,int fd,const char * path)355 blk_recv(struct sess *sess, int fd, const char *path)
356 {
357 	struct blkset	*s;
358 	int32_t		 i;
359 	size_t		 j;
360 	struct blk	*b;
361 	off_t		 offs = 0;
362 
363 	if ((s = calloc(1, sizeof(struct blkset))) == NULL) {
364 		ERR("calloc");
365 		return NULL;
366 	}
367 
368 	/*
369 	 * The block prologue consists of a few values that we'll need
370 	 * in reading the individual blocks for this file.
371 	 * FIXME: read into buffer and unbuffer.
372 	 */
373 
374 	if (!io_read_size(sess, fd, &s->blksz)) {
375 		ERRX1("io_read_size");
376 		goto out;
377 	} else if (!io_read_size(sess, fd, &s->len)) {
378 		ERRX1("io_read_size");
379 		goto out;
380 	} else if (!io_read_size(sess, fd, &s->csum)) {
381 		ERRX1("io_read_int");
382 		goto out;
383 	} else if (!io_read_size(sess, fd, &s->rem)) {
384 		ERRX1("io_read_int");
385 		goto out;
386 	} else if (s->rem && s->rem >= s->len) {
387 		ERRX("block remainder is "
388 			"greater than block size");
389 		goto out;
390 	}
391 
392 	LOG3("%s: read block prologue: %zu blocks of "
393 	    "%zu B, %zu B remainder, %zu B checksum", path,
394 	    s->blksz, s->len, s->rem, s->csum);
395 
396 	if (s->blksz) {
397 		s->blks = calloc(s->blksz, sizeof(struct blk));
398 		if (s->blks == NULL) {
399 			ERR("calloc");
400 			goto out;
401 		}
402 	}
403 
404 	/*
405 	 * Read each block individually.
406 	 * FIXME: read buffer and unbuffer.
407 	 */
408 
409 	for (j = 0; j < s->blksz; j++) {
410 		b = &s->blks[j];
411 		if (!io_read_int(sess, fd, &i)) {
412 			ERRX1("io_read_int");
413 			goto out;
414 		}
415 		b->chksum_short = i;
416 
417 		assert(s->csum <= sizeof(b->chksum_long));
418 		if (!io_read_buf(sess,
419 		    fd, b->chksum_long, s->csum)) {
420 			ERRX1("io_read_buf");
421 			goto out;
422 		}
423 
424 		/*
425 		 * If we're the last block, then we're assigned the
426 		 * remainder of the data.
427 		 */
428 
429 		b->offs = offs;
430 		b->idx = j;
431 		b->len = (j == (s->blksz - 1) && s->rem) ?
432 			s->rem : s->len;
433 		offs += b->len;
434 
435 		LOG4("%s: read block %zu, length %zu B",
436 		    path, b->idx, b->len);
437 	}
438 
439 	s->size = offs;
440 	LOG3("%s: read blocks: %zu blocks, %jd B total blocked data",
441 	    path, s->blksz, (intmax_t)s->size);
442 	return s;
443 out:
444 	free(s->blks);
445 	free(s);
446 	return NULL;
447 }
448 
449 /*
450  * Symmetrise blk_recv_ack(), except w/o the leading identifier.
451  * Return zero on failure, non-zero on success.
452  */
453 int
blk_send_ack(struct sess * sess,int fd,struct blkset * p)454 blk_send_ack(struct sess *sess, int fd, struct blkset *p)
455 {
456 	char	 buf[16];
457 	size_t	 pos = 0, sz;
458 
459 	/* Put the entire send routine into a buffer. */
460 
461 	sz = sizeof(int32_t) + /* block count */
462 	    sizeof(int32_t) + /* block length */
463 	    sizeof(int32_t) + /* checksum length */
464 	    sizeof(int32_t); /* block remainder */
465 	assert(sz <= sizeof(buf));
466 
467 	if (!io_read_buf(sess, fd, buf, sz)) {
468 		ERRX1("io_read_buf");
469 		return 0;
470 	}
471 
472 	if (!io_unbuffer_size(buf, &pos, sz, &p->blksz))
473 		ERRX1("io_unbuffer_size");
474 	else if (!io_unbuffer_size(buf, &pos, sz, &p->len))
475 		ERRX1("io_unbuffer_size");
476 	else if (!io_unbuffer_size(buf, &pos, sz, &p->csum))
477 		ERRX1("io_unbuffer_size");
478 	else if (!io_unbuffer_size(buf, &pos, sz, &p->rem))
479 		ERRX1("io_unbuffer_size");
480 	else if (p->len && p->rem >= p->len)
481 		ERRX1("non-zero length is less than remainder");
482 	else if (p->csum == 0 || p->csum > 16)
483 		ERRX1("inappropriate checksum length");
484 	else
485 		return 1;
486 
487 	return 0;
488 }
489 
490 /*
491  * Transmit the metadata for set and blocks.
492  * Return zero on failure, non-zero on success.
493  */
494 int
blk_send(struct sess * sess,int fd,size_t idx,const struct blkset * p,const char * path)495 blk_send(struct sess *sess, int fd, size_t idx,
496 	const struct blkset *p, const char *path)
497 {
498 	char	*buf;
499 	size_t	 i, pos = 0, sz;
500 	int	 rc = 0;
501 
502 	/* Put the entire send routine into a buffer. */
503 
504 	sz = sizeof(int32_t) + /* identifier */
505 	    sizeof(int32_t) + /* block count */
506 	    sizeof(int32_t) + /* block length */
507 	    sizeof(int32_t) + /* checksum length */
508 	    sizeof(int32_t) + /* block remainder */
509 	    p->blksz *
510 	    (sizeof(int32_t) + /* short checksum */
511 		p->csum); /* long checksum */
512 
513 	if ((buf = malloc(sz)) == NULL) {
514 		ERR("malloc");
515 		return 0;
516 	}
517 
518 	io_buffer_int(buf, &pos, sz, idx);
519 	io_buffer_int(buf, &pos, sz, p->blksz);
520 	io_buffer_int(buf, &pos, sz, p->len);
521 	io_buffer_int(buf, &pos, sz, p->csum);
522 	io_buffer_int(buf, &pos, sz, p->rem);
523 
524 	for (i = 0; i < p->blksz; i++) {
525 		io_buffer_int(buf, &pos, sz, p->blks[i].chksum_short);
526 		io_buffer_buf(buf, &pos, sz, p->blks[i].chksum_long, p->csum);
527 	}
528 
529 	assert(pos == sz);
530 
531 	if (!io_write_buf(sess, fd, buf, sz)) {
532 		ERRX1("io_write_buf");
533 		goto out;
534 	}
535 
536 	LOG3("%s: sent block prologue: %zu blocks of %zu B, "
537 	    "%zu B remainder, %zu B checksum",
538 	    path, p->blksz, p->len, p->rem, p->csum);
539 	rc = 1;
540 out:
541 	free(buf);
542 	return rc;
543 }
544