/* $OpenBSD: blocks.c,v 1.27 2024/09/27 13:13:14 tb Exp $ */
/*
 * Copyright (c) 2019 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/queue.h>
#include <sys/stat.h>

#include <assert.h>
#include <endian.h>
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <openssl/md4.h>

#include "extern.h"

struct blkhash {
	const struct blk *blk;
	TAILQ_ENTRY(blkhash) entries;
};

TAILQ_HEAD(blkhashq, blkhash);

/*
 * Table used by the sender for looking up blocks.
 * The blocks are those sent by the receiver; we're looking up using
 * hashes computed from our local file.
 */
struct blktab {
	struct blkhashq *q; /* entries in the hashtable */
	size_t qsz; /* size of the hashtable */
	struct blkhash *blks; /* pre-allocated hashtable entries */
};

/*
 * This is the number of buckets in the hashtable.
 * Use the same count that GPL rsync uses.
 * (Should this be dynamic?)
 */
#define BLKTAB_SZ 65536
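
/*
 * Note: since BLKTAB_SZ is 2^16, the bucket index computed below as
 * "chksum_short % BLKTAB_SZ" keeps only the low 16 bits of the fast
 * hash, i.e. its byte-sum half (s1).
 */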

/*
 * Initialise an empty hashtable with BLKTAB_SZ entries in it.
 * Populate it for each file with blkhash_set.
 * When we've processed all files, call blkhash_free.
 * Returns NULL on allocation failure.
 */
struct blktab *
blkhash_alloc(void)
{
	struct blktab *p;

	if ((p = calloc(1, sizeof(struct blktab))) == NULL) {
		ERR("calloc");
		return NULL;
	}
	p->qsz = BLKTAB_SZ;
	p->q = calloc(p->qsz, sizeof(struct blkhashq));
	if (p->q == NULL) {
		ERR("calloc");
		free(p);
		return NULL;
	}
	return p;
}
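
/*
 * A minimal lifecycle sketch (illustrative only; "tab" and "bset" are
 * hypothetical caller-side names, not taken from this codebase):
 *
 *	struct blktab *tab;
 *
 *	if ((tab = blkhash_alloc()) == NULL)
 *		return 0;
 *	if (!blkhash_set(tab, bset))	(once per incoming block set)
 *		return 0;
 *	...lookups happen in blk_find() via st->blktab...
 *	blkhash_free(tab);
 */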

/*
 * Populate the hashtable with an incoming file's blocks.
 * This will clear out any existing hashed data.
 * Returns zero on allocation failure, non-zero otherwise.
 */
int
blkhash_set(struct blktab *p, const struct blkset *bset)
{
	struct blkhash *blks;
	size_t i, idx;

	if (bset == NULL || bset->blksz == 0)
		return 1;

	/* Wipe clean the table. */

	for (i = 0; i < p->qsz; i++)
		TAILQ_INIT(&p->q[i]);

	/* Fill in the hashtable. */

	blks = reallocarray(p->blks, bset->blksz, sizeof(struct blkhash));
	if (blks == NULL) {
		ERR("reallocarray");
		free(p->blks);
		p->blks = NULL;
		return 0;
	}
	p->blks = blks;
	for (i = 0; i < bset->blksz; i++) {
		p->blks[i].blk = &bset->blks[i];
		idx = bset->blks[i].chksum_short % p->qsz;
		assert(idx < p->qsz);
		TAILQ_INSERT_TAIL(&p->q[idx], &p->blks[i], entries);
	}

	return 1;
}

/*
 * Free as allocated with blkhash_alloc().
 */
void
blkhash_free(struct blktab *p)
{
	if (p == NULL)
		return;
	free(p->q);
	free(p->blks);
	free(p);
}

/*
 * From our current position "offs" in the mapped file "map" (both held
 * in "st", along with the total size "mapsz"), see if we can find a
 * matching block in our list of blocks.
 * The hint in "st" names a block that *might* match right away.
 * Returns the blk or NULL if no matching block was found.
 */
static const struct blk *
blk_find(struct sess *sess, struct blkstat *st,
	const struct blkset *blks, const char *path, int recomp)
{
	unsigned char md[MD4_DIGEST_LENGTH];
	uint32_t fhash;
	off_t remain, osz;
	int have_md = 0;
	char *map;
	const struct blkhashq *q;
	const struct blkhash *ent;

	remain = st->mapsz - st->offs;
	assert(remain);
	osz = MINIMUM(remain, (off_t)blks->len);

	/*
	 * First, compute our fast hash: the hard way when we're called
	 * for the first time or re-entering after a previous block
	 * match, otherwise from the s1 and s2 values rolled forward at
	 * the end of the previous call.
	 */

	if (!recomp) {
		fhash = (st->s1 & 0xFFFF) | (st->s2 << 16);
	} else {
		fhash = hash_fast(st->map + st->offs, (size_t)osz);
		st->s1 = fhash & 0xFFFF;
		st->s2 = fhash >> 16;
	}
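
	/*
	 * For reference, a sketch of how the fast hash decomposes
	 * (assuming hash_fast(), defined elsewhere, implements the
	 * usual rsync rolling checksum over the window a[0..n-1]):
	 *
	 *	s1 = (a[0] + a[1] + ... + a[n-1]) & 0xFFFF
	 *	s2 = (n*a[0] + (n-1)*a[1] + ... + a[n-1]) & 0xFFFF
	 *	fhash = s1 | (s2 << 16)
	 */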

	/*
	 * Start with our match hint.
	 * This just runs the fast and slow check with the hint.
	 */

	if (st->hint < blks->blksz &&
	    fhash == blks->blks[st->hint].chksum_short &&
	    (size_t)osz == blks->blks[st->hint].len) {
		hash_slow(st->map + st->offs, (size_t)osz, md, sess);
		have_md = 1;
		if (memcmp(md, blks->blks[st->hint].chksum_long,
		    blks->csum) == 0) {
			LOG4("%s: found matching hinted match: "
			    "position %jd, block %zu (position %jd, size %zu)",
			    path,
			    (intmax_t)st->offs, blks->blks[st->hint].idx,
			    (intmax_t)blks->blks[st->hint].offs,
			    blks->blks[st->hint].len);
			return &blks->blks[st->hint];
		}
	}

	/*
	 * Look up the bucket for the fast hash in our hashtable, filter
	 * for entries matching the full fast hash and length, then move
	 * to the slow hash.
	 * The slow hash is computed only once.
	 */

	q = &st->blktab->q[fhash % st->blktab->qsz];

	TAILQ_FOREACH(ent, q, entries) {
		if (fhash != ent->blk->chksum_short ||
		    (size_t)osz != ent->blk->len)
			continue;

		LOG4("%s: found matching fast match: "
		    "position %jd, block %zu (position %jd, size %zu)",
		    path, (intmax_t)st->offs, ent->blk->idx,
		    (intmax_t)ent->blk->offs, ent->blk->len);

		if (have_md == 0) {
			hash_slow(st->map + st->offs, (size_t)osz, md, sess);
			have_md = 1;
		}

		if (memcmp(md, ent->blk->chksum_long, blks->csum))
			continue;

		LOG4("%s: sender verifies slow match", path);
		return ent->blk;
	}

	/*
	 * No match: adjust our partial sums so the next call can roll
	 * the fast hash forward by one byte.
	 * We first remove the first byte of the window from the sums.
	 * Then, if there's data left past the window, we add the byte
	 * just beyond it.
	 */
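
	/*
	 * A sketch of the rolling update performed below, for a window
	 * a[0..n-1] of length n = osz sliding forward by one byte (this
	 * mirrors the classic rsync recurrence):
	 *
	 *	s1' = s1 - a[0] + a[n]
	 *	s2' = s2 - n*a[0] + s1'
	 *
	 * The "+ a[n]" and "+ s1'" steps are skipped when the window
	 * already touches the end of the mapped data.
	 */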

	map = st->map + st->offs;
	st->s1 -= map[0];
	st->s2 -= osz * map[0];

	if (osz < remain) {
		st->s1 += map[osz];
		st->s2 += st->s1;
	}

	return NULL;
}

/*
 * Given a local file "path" and the blocks created by a remote machine,
 * find out which blocks of our file they don't have and send them.
 * This function is reentrant: it must be called repeatedly for as long
 * as there's still data to send.
 */
void
blk_match(struct sess *sess, const struct blkset *blks,
	const char *path, struct blkstat *st)
{
	off_t last, end = 0, sz;
	int32_t tok;
	size_t i;
	const struct blk *blk;

	/*
	 * If the file's empty or we don't have any blocks from the
	 * receiver, then simply send the whole file.
	 * Otherwise, run the hash matching routine and send raw chunks
	 * and subsequent matching tokens.
	 */

	if (st->mapsz && blks->blksz) {
		/*
		 * Stop searching at the length of the file minus the
		 * size of the last block.
		 * The reason is that we don't need to do an incremental
		 * hash within the last block---if it doesn't match, it
		 * doesn't match.
		 */

		end = st->mapsz + 1 - blks->blks[blks->blksz - 1].len;
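
		/*
		 * A worked example of the bound above (hypothetical
		 * numbers): with mapsz = 1000 and a 300 B final remote
		 * block, end = 701, so the last window scanned starts
		 * at offset 700 and is exactly 300 B long.
		 */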
	}

	last = st->offs;
	for (i = 0; st->offs < end; st->offs++, i++) {
		blk = blk_find(sess, st, blks, path, i == 0);
		if (blk == NULL)
			continue;

		sz = st->offs - last;
		st->dirty += sz;
		st->total += sz;
		LOG4("%s: flushing %jd B before %zu B block %zu",
		    path, (intmax_t)sz,
		    blk->len, blk->idx);
		tok = -(blk->idx + 1);
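
		/*
		 * Aside on the token value: matched blocks are encoded
		 * as the negative of (index + 1), which keeps zero free
		 * to act as the terminator token emitted below once
		 * nothing more matches.
		 */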

		hash_file_buf(&st->ctx, st->map + last, sz + blk->len);

		/*
		 * Write the data we have, then follow it with
		 * the tag of the block that matches.
		 */

		st->curpos = last;
		st->curlen = st->curpos + sz;
		st->curtok = tok;
		assert(st->curtok != 0);
		st->curst = sz ? BLKSTAT_DATA : BLKSTAT_TOK;
		st->total += blk->len;
		st->offs += blk->len;
		st->hint = blk->idx + 1;

		return;
	}

	/* Emit remaining data and send terminator token. */

	sz = st->mapsz - last;
	LOG4("%s: flushing %s %jd B", path,
	    last == 0 ? "whole" : "remaining", (intmax_t)sz);

	hash_file_buf(&st->ctx, st->map + last, sz);

	st->total += sz;
	st->dirty += sz;
	st->curpos = last;
	st->curlen = st->curpos + sz;
	st->curtok = 0;
	st->curst = sz ? BLKSTAT_DATA : BLKSTAT_TOK;
}

/*
 * Buffer the message from sender to the receiver indicating that the
 * block set has been received.
 * Symmetrises blk_send_ack().
 */
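/*
 * A sketch of the resulting 20-byte buffer: five 32-bit integers as
 * laid out by io_buffer_int() (byte order is whatever that helper
 * uses):
 *
 *	[0]  file index
 *	[4]  block count
 *	[8]  block length
 *	[12] checksum length
 *	[16] block remainder
 */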
void
blk_recv_ack(char buf[20], const struct blkset *blocks, int32_t idx)
{
	size_t pos = 0, sz;

	sz = sizeof(int32_t) + /* index */
	    sizeof(int32_t) + /* block count */
	    sizeof(int32_t) + /* block length */
	    sizeof(int32_t) + /* checksum length */
	    sizeof(int32_t); /* block remainder */
	assert(sz == 20);

	io_buffer_int(buf, &pos, sz, idx);
	io_buffer_int(buf, &pos, sz, blocks->blksz);
	io_buffer_int(buf, &pos, sz, blocks->len);
	io_buffer_int(buf, &pos, sz, blocks->csum);
	io_buffer_int(buf, &pos, sz, blocks->rem);
	assert(pos == sz);
}

/*
 * Read all of the checksums for a file's blocks.
 * Returns the set of blocks or NULL on failure.
 */
struct blkset *
blk_recv(struct sess *sess, int fd, const char *path)
{
	struct blkset *s;
	int32_t i;
	size_t j;
	struct blk *b;
	off_t offs = 0;

	if ((s = calloc(1, sizeof(struct blkset))) == NULL) {
		ERR("calloc");
		return NULL;
	}

	/*
	 * The block prologue consists of a few values that we'll need
	 * in reading the individual blocks for this file.
	 * FIXME: read into buffer and unbuffer.
	 */
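
	/*
	 * A sketch of the on-the-wire layout read below (as implied by
	 * the io_read_* calls; each value is a protocol integer):
	 *
	 *	block count, block length, checksum length, remainder
	 *	then, per block:
	 *		fast checksum (4 bytes)
	 *		slow checksum (csum bytes of the MD4 digest)
	 */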

	if (!io_read_size(sess, fd, &s->blksz)) {
		ERRX1("io_read_size");
		goto out;
	} else if (!io_read_size(sess, fd, &s->len)) {
		ERRX1("io_read_size");
		goto out;
	} else if (!io_read_size(sess, fd, &s->csum)) {
		ERRX1("io_read_size");
		goto out;
	} else if (!io_read_size(sess, fd, &s->rem)) {
		ERRX1("io_read_size");
		goto out;
	} else if (s->rem && s->rem >= s->len) {
		ERRX("block remainder is "
		    "greater than block size");
		goto out;
	}

	LOG3("%s: read block prologue: %zu blocks of "
	    "%zu B, %zu B remainder, %zu B checksum", path,
	    s->blksz, s->len, s->rem, s->csum);

	if (s->blksz) {
		s->blks = calloc(s->blksz, sizeof(struct blk));
		if (s->blks == NULL) {
			ERR("calloc");
			goto out;
		}
	}

	/*
	 * Read each block individually.
	 * FIXME: read buffer and unbuffer.
	 */

	for (j = 0; j < s->blksz; j++) {
		b = &s->blks[j];
		if (!io_read_int(sess, fd, &i)) {
			ERRX1("io_read_int");
			goto out;
		}
		b->chksum_short = i;

		assert(s->csum <= sizeof(b->chksum_long));
		if (!io_read_buf(sess,
		    fd, b->chksum_long, s->csum)) {
			ERRX1("io_read_buf");
			goto out;
		}

		/*
		 * If this is the last block, it's assigned the
		 * remainder of the data.
		 */

		b->offs = offs;
		b->idx = j;
		b->len = (j == (s->blksz - 1) && s->rem) ?
		    s->rem : s->len;
		offs += b->len;

		LOG4("%s: read block %zu, length %zu B",
		    path, b->idx, b->len);
	}

	s->size = offs;
	LOG3("%s: read blocks: %zu blocks, %jd B total blocked data",
	    path, s->blksz, (intmax_t)s->size);
	return s;
out:
	free(s->blks);
	free(s);
	return NULL;
}

/*
 * Symmetrise blk_recv_ack(), except w/o the leading identifier.
 * Return zero on failure, non-zero on success.
 */
int
blk_send_ack(struct sess *sess, int fd, struct blkset *p)
{
	char buf[16];
	size_t pos = 0, sz;

	/* Read the entire ack into a buffer, then pick it apart. */

	sz = sizeof(int32_t) + /* block count */
	    sizeof(int32_t) + /* block length */
	    sizeof(int32_t) + /* checksum length */
	    sizeof(int32_t); /* block remainder */
	assert(sz <= sizeof(buf));

	if (!io_read_buf(sess, fd, buf, sz)) {
		ERRX1("io_read_buf");
		return 0;
	}

	if (!io_unbuffer_size(buf, &pos, sz, &p->blksz))
		ERRX1("io_unbuffer_size");
	else if (!io_unbuffer_size(buf, &pos, sz, &p->len))
		ERRX1("io_unbuffer_size");
	else if (!io_unbuffer_size(buf, &pos, sz, &p->csum))
		ERRX1("io_unbuffer_size");
	else if (!io_unbuffer_size(buf, &pos, sz, &p->rem))
		ERRX1("io_unbuffer_size");
	else if (p->len && p->rem >= p->len)
		ERRX1("non-zero length is less than remainder");
	else if (p->csum == 0 || p->csum > 16)
		ERRX1("inappropriate checksum length");
	else
		return 1;

	return 0;
}

/*
 * Transmit the metadata for set and blocks.
 * Return zero on failure, non-zero on success.
 */
int
blk_send(struct sess *sess, int fd, size_t idx,
	const struct blkset *p, const char *path)
{
	char *buf;
	size_t i, pos = 0, sz;
	int rc = 0;

	/* Put the entire send routine into a buffer. */

	sz = sizeof(int32_t) + /* identifier */
	    sizeof(int32_t) + /* block count */
	    sizeof(int32_t) + /* block length */
	    sizeof(int32_t) + /* checksum length */
	    sizeof(int32_t) + /* block remainder */
	    p->blksz *
	    (sizeof(int32_t) + /* short checksum */
	    p->csum); /* long checksum */
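
	/*
	 * For a sense of scale (hypothetical numbers): with 100 blocks
	 * and a 16 B long checksum, sz = 20 + 100 * (4 + 16) = 2020
	 * bytes.
	 */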

	if ((buf = malloc(sz)) == NULL) {
		ERR("malloc");
		return 0;
	}

	io_buffer_int(buf, &pos, sz, idx);
	io_buffer_int(buf, &pos, sz, p->blksz);
	io_buffer_int(buf, &pos, sz, p->len);
	io_buffer_int(buf, &pos, sz, p->csum);
	io_buffer_int(buf, &pos, sz, p->rem);

	for (i = 0; i < p->blksz; i++) {
		io_buffer_int(buf, &pos, sz, p->blks[i].chksum_short);
		io_buffer_buf(buf, &pos, sz, p->blks[i].chksum_long, p->csum);
	}

	assert(pos == sz);

	if (!io_write_buf(sess, fd, buf, sz)) {
		ERRX1("io_write_buf");
		goto out;
	}

	LOG3("%s: sent block prologue: %zu blocks of %zu B, "
	    "%zu B remainder, %zu B checksum",
	    path, p->blksz, p->len, p->rem, p->csum);
	rc = 1;
out:
	free(buf);
	return rc;
}