1 /*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*-
2  *
3  * librsync -- library for network deltas
4  *
5  * Copyright (C) 1999, 2000, 2001 by Martin Pool <mbp@sourcefrog.net>
6  * Copyright (C) 1999 by Andrew Tridgell
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation; either version 2.1 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  */
22 
23 #include "config.h"
24 #include <assert.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include "librsync.h"
28 #include "sumset.h"
29 #include "trace.h"
30 #include "util.h"
31 
rs_block_sig_init(rs_block_sig_t * sig,rs_weak_sum_t weak_sum,rs_strong_sum_t * strong_sum,int strong_len)32 static void rs_block_sig_init(rs_block_sig_t *sig, rs_weak_sum_t weak_sum,
33                               rs_strong_sum_t *strong_sum, int strong_len)
34 {
35     sig->weak_sum = weak_sum;
36     if (strong_sum)
37         memcpy(sig->strong_sum, strong_sum, (size_t)strong_len);
38 }
39 
rs_block_sig_hash(const rs_block_sig_t * sig)40 static inline unsigned rs_block_sig_hash(const rs_block_sig_t *sig)
41 {
42     return (unsigned)sig->weak_sum;
43 }
44 
45 typedef struct rs_block_match {
46     rs_block_sig_t block_sig;
47     rs_signature_t *signature;
48     const void *buf;
49     size_t len;
50 } rs_block_match_t;
51 
rs_block_match_init(rs_block_match_t * match,rs_signature_t * sig,rs_weak_sum_t weak_sum,rs_strong_sum_t * strong_sum,const void * buf,size_t len)52 static void rs_block_match_init(rs_block_match_t *match, rs_signature_t *sig,
53                                 rs_weak_sum_t weak_sum,
54                                 rs_strong_sum_t *strong_sum, const void *buf,
55                                 size_t len)
56 {
57     rs_block_sig_init(&match->block_sig, weak_sum, strong_sum,
58                       sig->strong_sum_len);
59     match->signature = sig;
60     match->buf = buf;
61     match->len = len;
62 }
63 
rs_block_match_cmp(rs_block_match_t * match,const rs_block_sig_t * block_sig)64 static inline int rs_block_match_cmp(rs_block_match_t *match,
65                                      const rs_block_sig_t *block_sig)
66 {
67     /* If buf is not NULL, the strong sum is yet to be calculated. */
68     if (match->buf) {
69 #ifndef HASHTABLE_NSTATS
70         match->signature->calc_strong_count++;
71 #endif
72         rs_signature_calc_strong_sum(match->signature, match->buf, match->len,
73                                      &(match->block_sig.strong_sum));
74         match->buf = NULL;
75     }
76     return memcmp(&match->block_sig.strong_sum, &block_sig->strong_sum,
77                   (size_t)match->signature->strong_sum_len);
78 }
79 
80 /* Disable mix32() in the hashtable because RabinKarp doesn't need it. We
81    manually apply mix32() to rollsums before using them in the hashtable. */
82 #define HASHTABLE_NMIX32
83 /* Instantiate hashtable for rs_block_sig and rs_block_match. */
84 #define ENTRY rs_block_sig
85 #define MATCH rs_block_match
86 #define NAME hashtable
87 #include "hashtable.h"
88 
89 /* Get the size of a packed rs_block_sig_t. */
rs_block_sig_size(const rs_signature_t * sig)90 static inline size_t rs_block_sig_size(const rs_signature_t *sig)
91 {
92     /* Round up to multiple of sizeof(weak_sum) to align memory correctly. */
93     const size_t mask = sizeof(rs_weak_sum_t)- 1;
94     return (offsetof(rs_block_sig_t, strong_sum) +
95             (((size_t)sig->strong_sum_len + mask) & ~mask));
96 }
97 
98 /* Get the pointer to the block_sig_t from a block index. */
rs_block_sig_ptr(const rs_signature_t * sig,int block_idx)99 static inline rs_block_sig_t *rs_block_sig_ptr(const rs_signature_t *sig,
100                                                int block_idx)
101 {
102     return (rs_block_sig_t *)((char *)sig->block_sigs +
103                                block_idx * rs_block_sig_size(sig));
104 }
105 
106 /* Get the index of a block from a block_sig_t pointer. */
rs_block_sig_idx(const rs_signature_t * sig,rs_block_sig_t * block_sig)107 static inline int rs_block_sig_idx(const rs_signature_t *sig,
108                                    rs_block_sig_t *block_sig)
109 {
110     return (int)(((char *)block_sig -
111                   (char *)sig->block_sigs) / rs_block_sig_size(sig));
112 }
113 
rs_sig_args(rs_long_t old_fsize,rs_magic_number * magic,size_t * block_len,size_t * strong_len)114 rs_result rs_sig_args(rs_long_t old_fsize, rs_magic_number * magic,
115                       size_t *block_len, size_t *strong_len)
116 {
117     size_t rec_block_len;       /* the recomended block_len for the given
118                                    old_fsize. */
119     size_t min_strong_len;      /* the minimum strong_len for the given
120                                    old_fsize and block_len. */
121     size_t max_strong_len;      /* the maximum strong_len for the given magic. */
122 
123     /* Check and set default arguments. */
124     *magic = *magic ? *magic : RS_RK_BLAKE2_SIG_MAGIC;
125     switch (*magic) {
126     case RS_BLAKE2_SIG_MAGIC:
127     case RS_RK_BLAKE2_SIG_MAGIC:
128         max_strong_len = RS_BLAKE2_SUM_LENGTH;
129         break;
130     case RS_MD4_SIG_MAGIC:
131     case RS_RK_MD4_SIG_MAGIC:
132         max_strong_len = RS_MD4_SUM_LENGTH;
133         break;
134     default:
135         rs_error("invalid magic %#x", *magic);
136         return RS_BAD_MAGIC;
137     }
138     /* The recommended block_len is sqrt(old_fsize) with a 256 min size rounded
139        down to a multiple of the 128 byte blake2b blocksize to give a
140        reasonable compromise between signature size, delta size, and
141        performance. If the old_fsize is unknown, we use the default. */
142     if (old_fsize < 0) {
143         rec_block_len = RS_DEFAULT_BLOCK_LEN;
144     } else {
145         rec_block_len =
146             old_fsize <= 256 * 256 ? 256 : rs_long_sqrt(old_fsize) & ~127;
147     }
148     if (*block_len == 0)
149         *block_len = rec_block_len;
150     /* The recommended strong_len assumes the worst case new_fsize = old_fsize
151        + 16MB with no matches. This results in comparing a block at every byte
152        offset against all the blocks in the signature, or new_fsize*block_num
153        comparisons. With N bits in the blocksig, there is a 1/2^N chance per
154        comparison of a hash colision. So with 2^N attempts there would be a
155        fair chance of having a collision. So we want to round up to the next
156        byte, add an extra 2 bytes (16 bits) in the strongsum, and assume the
157        weaksum is worth another 16 bits, for at least 32 bits extra, giving a
158        worst case 1/2^32 chance of having a hash collision per delta. If
159        old_fsize is unknown we use a conservative default. */
160     if (old_fsize < 0) {
161         min_strong_len = RS_DEFAULT_MIN_STRONG_LEN;
162     } else {
163         min_strong_len =
164             2 + (rs_long_ln2(old_fsize + ((rs_long_t)1 << 24)) +
165                  rs_long_ln2(old_fsize / *block_len + 1) + 7) / 8;
166     }
167     if (*strong_len == 0)
168         *strong_len = max_strong_len;
169     else if (*strong_len == -1)
170         *strong_len = min_strong_len;
171     else if (old_fsize >= 0 && *strong_len < min_strong_len) {
172         rs_warn("strong_len=" FMT_SIZE " smaller than recommended minimum "
173                 FMT_SIZE " for old_fsize=" FMT_LONG " with block_len=" FMT_SIZE,
174                 *strong_len, min_strong_len, old_fsize, *block_len);
175     } else if (*strong_len > max_strong_len) {
176         rs_error("invalid strong_len=" FMT_SIZE " for magic=%#x", *strong_len,
177                  (int)*magic);
178         return RS_PARAM_ERROR;
179     }
180     rs_sig_args_check(*magic, *block_len, *strong_len);
181     return RS_DONE;
182 }
183 
rs_signature_init(rs_signature_t * sig,rs_magic_number magic,size_t block_len,size_t strong_len,rs_long_t sig_fsize)184 rs_result rs_signature_init(rs_signature_t *sig, rs_magic_number magic,
185                             size_t block_len, size_t strong_len,
186                             rs_long_t sig_fsize)
187 {
188     rs_result result;
189 
190     /* Check and set default arguments, using old_fsize=-1 for unknown. */
191     if ((result = rs_sig_args(-1, &magic, &block_len, &strong_len)) != RS_DONE)
192         return result;
193     /* Set attributes from args. */
194     sig->magic = magic;
195     sig->block_len = (int)block_len;
196     sig->strong_sum_len = (int)strong_len;
197     sig->count = 0;
198     /* Calculate the number of blocks if we have the signature file size. */
199     /* Magic+header is 12 bytes, each block thereafter is 4 bytes
200        weak_sum+strong_sum_len bytes */
201     sig->size = (int)(sig_fsize < 12 ? 0 : (sig_fsize - 12) / (4 + strong_len));
202     if (sig->size)
203         sig->block_sigs =
204             rs_alloc(sig->size * rs_block_sig_size(sig),
205                      "signature->block_sigs");
206     else
207         sig->block_sigs = NULL;
208     sig->hashtable = NULL;
209 #ifndef HASHTABLE_NSTATS
210     sig->calc_strong_count = 0;
211 #endif
212     rs_signature_check(sig);
213     return RS_DONE;
214 }
215 
rs_signature_done(rs_signature_t * sig)216 void rs_signature_done(rs_signature_t *sig)
217 {
218     hashtable_free(sig->hashtable);
219     free(sig->block_sigs);
220     rs_bzero(sig, sizeof(*sig));
221 }
222 
rs_signature_add_block(rs_signature_t * sig,rs_weak_sum_t weak_sum,rs_strong_sum_t * strong_sum)223 rs_block_sig_t *rs_signature_add_block(rs_signature_t *sig,
224                                        rs_weak_sum_t weak_sum,
225                                        rs_strong_sum_t *strong_sum)
226 {
227     rs_signature_check(sig);
228     /* Apply mix32() to rollsum weaksums to improve their distribution. */
229     if (rs_signature_weaksum_kind(sig) == RS_ROLLSUM)
230         weak_sum = mix32(weak_sum);
231     /* If block_sigs is full, allocate more space. */
232     if (sig->count == sig->size) {
233         sig->size = sig->size ? sig->size * 2 : 16;
234         sig->block_sigs =
235             rs_realloc(sig->block_sigs, sig->size * rs_block_sig_size(sig),
236                        "signature->block_sigs");
237     }
238     rs_block_sig_t *b = rs_block_sig_ptr(sig, sig->count++);
239     rs_block_sig_init(b, weak_sum, strong_sum, sig->strong_sum_len);
240     return b;
241 }
242 
rs_signature_find_match(rs_signature_t * sig,rs_weak_sum_t weak_sum,void const * buf,size_t len)243 rs_long_t rs_signature_find_match(rs_signature_t *sig, rs_weak_sum_t weak_sum,
244                                   void const *buf, size_t len)
245 {
246     rs_block_match_t m;
247     rs_block_sig_t *b;
248 
249     rs_signature_check(sig);
250     rs_block_match_init(&m, sig, weak_sum, NULL, buf, len);
251     if ((b = hashtable_find(sig->hashtable, &m))) {
252         return (rs_long_t)rs_block_sig_idx(sig, b) * sig->block_len;
253     }
254     return -1;
255 }
256 
rs_signature_log_stats(rs_signature_t const * sig)257 void rs_signature_log_stats(rs_signature_t const *sig)
258 {
259 #ifndef HASHTABLE_NSTATS
260     hashtable_t *t = sig->hashtable;
261 
262     rs_log(RS_LOG_INFO | RS_LOG_NONAME,
263            "match statistics: signature[%ld searches, %ld (%.3f%%) matches, "
264            "%ld (%.3fx) weak sum compares, %ld (%.3f%%) strong sum compares, "
265            "%ld (%.3f%%) strong sum calcs]", t->find_count, t->match_count,
266            100.0 * (double)t->match_count / (double)t->find_count,
267            t->hashcmp_count, (double)t->hashcmp_count / (double)t->find_count,
268            t->entrycmp_count,
269            100.0 * (double)t->entrycmp_count / (double)t->find_count,
270            sig->calc_strong_count,
271            100.0 * (double)sig->calc_strong_count / (double)t->find_count);
272 #endif
273 }
274 
rs_build_hash_table(rs_signature_t * sig)275 rs_result rs_build_hash_table(rs_signature_t *sig)
276 {
277     rs_block_match_t m;
278     rs_block_sig_t *b;
279     int i;
280 
281     rs_signature_check(sig);
282     sig->hashtable = hashtable_new(sig->count);
283     if (!sig->hashtable)
284         return RS_MEM_ERROR;
285     for (i = 0; i < sig->count; i++) {
286         b = rs_block_sig_ptr(sig, i);
287         rs_block_match_init(&m, sig, b->weak_sum, &b->strong_sum, NULL, 0);
288         if (!hashtable_find(sig->hashtable, &m))
289             hashtable_add(sig->hashtable, b);
290     }
291     hashtable_stats_init(sig->hashtable);
292     return RS_DONE;
293 }
294 
rs_free_sumset(rs_signature_t * psums)295 void rs_free_sumset(rs_signature_t *psums)
296 {
297     rs_signature_done(psums);
298     free(psums);
299 }
300 
rs_sumset_dump(rs_signature_t const * sums)301 void rs_sumset_dump(rs_signature_t const *sums)
302 {
303     int i;
304     rs_block_sig_t *b;
305     char strong_hex[RS_MAX_STRONG_SUM_LENGTH * 3];
306 
307     rs_log(RS_LOG_INFO | RS_LOG_NONAME,
308            "sumset info: magic=%#x, block_len=%d, block_num=%d", sums->magic,
309            sums->block_len, sums->count);
310 
311     for (i = 0; i < sums->count; i++) {
312         b = rs_block_sig_ptr(sums, i);
313         rs_hexify(strong_hex, b->strong_sum, sums->strong_sum_len);
314         rs_log(RS_LOG_INFO | RS_LOG_NONAME,
315                "sum %6d: weak=" FMT_WEAKSUM ", strong=%s", i, b->weak_sum,
316                strong_hex);
317     }
318 }
319