1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7
8 #include "common.h"
9
10 #include "git2/sys/hashsig.h"
11 #include "futils.h"
12 #include "util.h"
13
14 typedef uint32_t hashsig_t;
15 typedef uint64_t hashsig_state;
16
17 #define HASHSIG_SCALE 100
18
19 #define HASHSIG_MAX_RUN 80
20 #define HASHSIG_HASH_START 0x012345678ABCDEF0LL
21 #define HASHSIG_HASH_SHIFT 5
22
23 #define HASHSIG_HASH_MIX(S,CH) \
24 (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
25
26 #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
27 #define HASHSIG_HEAP_MIN_SIZE 4
28
29 typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
30
31 typedef struct {
32 int size, asize;
33 hashsig_cmp cmp;
34 hashsig_t values[HASHSIG_HEAP_SIZE];
35 } hashsig_heap;
36
37 struct git_hashsig {
38 hashsig_heap mins;
39 hashsig_heap maxs;
40 size_t lines;
41 git_hashsig_option_t opt;
42 };
43
44 #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
45 #define HEAP_RCHILD_OF(I) (((I)<<1)+2)
46 #define HEAP_PARENT_OF(I) (((I)-1)>>1)
47
hashsig_heap_init(hashsig_heap * h,hashsig_cmp cmp)48 static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
49 {
50 h->size = 0;
51 h->asize = HASHSIG_HEAP_SIZE;
52 h->cmp = cmp;
53 }
54
hashsig_cmp_max(const void * a,const void * b,void * payload)55 static int hashsig_cmp_max(const void *a, const void *b, void *payload)
56 {
57 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
58 GIT_UNUSED(payload);
59 return (av < bv) ? -1 : (av > bv) ? 1 : 0;
60 }
61
hashsig_cmp_min(const void * a,const void * b,void * payload)62 static int hashsig_cmp_min(const void *a, const void *b, void *payload)
63 {
64 hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
65 GIT_UNUSED(payload);
66 return (av > bv) ? -1 : (av < bv) ? 1 : 0;
67 }
68
hashsig_heap_up(hashsig_heap * h,int el)69 static void hashsig_heap_up(hashsig_heap *h, int el)
70 {
71 int parent_el = HEAP_PARENT_OF(el);
72
73 while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
74 hashsig_t t = h->values[el];
75 h->values[el] = h->values[parent_el];
76 h->values[parent_el] = t;
77
78 el = parent_el;
79 parent_el = HEAP_PARENT_OF(el);
80 }
81 }
82
hashsig_heap_down(hashsig_heap * h,int el)83 static void hashsig_heap_down(hashsig_heap *h, int el)
84 {
85 hashsig_t v, lv, rv;
86
87 /* 'el < h->size / 2' tests if el is bottom row of heap */
88
89 while (el < h->size / 2) {
90 int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
91
92 v = h->values[el];
93 lv = h->values[lel];
94 rv = h->values[rel];
95
96 if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
97 break;
98
99 swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
100
101 h->values[el] = h->values[swapel];
102 h->values[swapel] = v;
103
104 el = swapel;
105 }
106 }
107
hashsig_heap_sort(hashsig_heap * h)108 static void hashsig_heap_sort(hashsig_heap *h)
109 {
110 /* only need to do this at the end for signature comparison */
111 git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
112 }
113
hashsig_heap_insert(hashsig_heap * h,hashsig_t val)114 static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
115 {
116 /* if heap is not full, insert new element */
117 if (h->size < h->asize) {
118 h->values[h->size++] = val;
119 hashsig_heap_up(h, h->size - 1);
120 }
121
122 /* if heap is full, pop top if new element should replace it */
123 else if (h->cmp(&val, &h->values[0], NULL) > 0) {
124 h->size--;
125 h->values[0] = h->values[h->size];
126 hashsig_heap_down(h, 0);
127 }
128
129 }
130
131 typedef struct {
132 int use_ignores;
133 uint8_t ignore_ch[256];
134 } hashsig_in_progress;
135
hashsig_in_progress_init(hashsig_in_progress * prog,git_hashsig * sig)136 static int hashsig_in_progress_init(
137 hashsig_in_progress *prog, git_hashsig *sig)
138 {
139 int i;
140
141 /* no more than one can be set */
142 GIT_ASSERT(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
143 !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
144
145 if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
146 for (i = 0; i < 256; ++i)
147 prog->ignore_ch[i] = git__isspace_nonlf(i);
148 prog->use_ignores = 1;
149 } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
150 for (i = 0; i < 256; ++i)
151 prog->ignore_ch[i] = git__isspace(i);
152 prog->use_ignores = 1;
153 } else {
154 memset(prog, 0, sizeof(*prog));
155 }
156
157 return 0;
158 }
159
hashsig_add_hashes(git_hashsig * sig,const uint8_t * data,size_t size,hashsig_in_progress * prog)160 static int hashsig_add_hashes(
161 git_hashsig *sig,
162 const uint8_t *data,
163 size_t size,
164 hashsig_in_progress *prog)
165 {
166 const uint8_t *scan = data, *end = data + size;
167 hashsig_state state = HASHSIG_HASH_START;
168 int use_ignores = prog->use_ignores, len;
169 uint8_t ch;
170
171 while (scan < end) {
172 state = HASHSIG_HASH_START;
173
174 for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
175 ch = *scan;
176
177 if (use_ignores)
178 for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
179 ++scan;
180 else if (sig->opt &
181 (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
182 for (; scan < end && ch == '\r'; ch = *scan)
183 ++scan;
184
185 /* peek at next character to decide what to do next */
186 if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
187 use_ignores = (ch == '\n');
188
189 if (scan >= end)
190 break;
191 ++scan;
192
193 /* check run terminator */
194 if (ch == '\n' || ch == '\0') {
195 sig->lines++;
196 break;
197 }
198
199 ++len;
200 HASHSIG_HASH_MIX(state, ch);
201 }
202
203 if (len > 0) {
204 hashsig_heap_insert(&sig->mins, (hashsig_t)state);
205 hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
206
207 while (scan < end && (*scan == '\n' || !*scan))
208 ++scan;
209 }
210 }
211
212 prog->use_ignores = use_ignores;
213
214 return 0;
215 }
216
hashsig_finalize_hashes(git_hashsig * sig)217 static int hashsig_finalize_hashes(git_hashsig *sig)
218 {
219 if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
220 !(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
221 git_error_set(GIT_ERROR_INVALID,
222 "file too small for similarity signature calculation");
223 return GIT_EBUFS;
224 }
225
226 hashsig_heap_sort(&sig->mins);
227 hashsig_heap_sort(&sig->maxs);
228
229 return 0;
230 }
231
hashsig_alloc(git_hashsig_option_t opts)232 static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
233 {
234 git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
235 if (!sig)
236 return NULL;
237
238 hashsig_heap_init(&sig->mins, hashsig_cmp_min);
239 hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
240 sig->opt = opts;
241
242 return sig;
243 }
244
git_hashsig_create(git_hashsig ** out,const char * buf,size_t buflen,git_hashsig_option_t opts)245 int git_hashsig_create(
246 git_hashsig **out,
247 const char *buf,
248 size_t buflen,
249 git_hashsig_option_t opts)
250 {
251 int error;
252 hashsig_in_progress prog;
253 git_hashsig *sig = hashsig_alloc(opts);
254 GIT_ERROR_CHECK_ALLOC(sig);
255
256 if ((error = hashsig_in_progress_init(&prog, sig)) < 0)
257 return error;
258
259 error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog);
260
261 if (!error)
262 error = hashsig_finalize_hashes(sig);
263
264 if (!error)
265 *out = sig;
266 else
267 git_hashsig_free(sig);
268
269 return error;
270 }
271
git_hashsig_create_fromfile(git_hashsig ** out,const char * path,git_hashsig_option_t opts)272 int git_hashsig_create_fromfile(
273 git_hashsig **out,
274 const char *path,
275 git_hashsig_option_t opts)
276 {
277 uint8_t buf[0x1000];
278 ssize_t buflen = 0;
279 int error = 0, fd;
280 hashsig_in_progress prog;
281 git_hashsig *sig = hashsig_alloc(opts);
282 GIT_ERROR_CHECK_ALLOC(sig);
283
284 if ((fd = git_futils_open_ro(path)) < 0) {
285 git__free(sig);
286 return fd;
287 }
288
289 if ((error = hashsig_in_progress_init(&prog, sig)) < 0)
290 return error;
291
292 while (!error) {
293 if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
294 if ((error = (int)buflen) < 0)
295 git_error_set(GIT_ERROR_OS,
296 "read error on '%s' calculating similarity hashes", path);
297 break;
298 }
299
300 error = hashsig_add_hashes(sig, buf, buflen, &prog);
301 }
302
303 p_close(fd);
304
305 if (!error)
306 error = hashsig_finalize_hashes(sig);
307
308 if (!error)
309 *out = sig;
310 else
311 git_hashsig_free(sig);
312
313 return error;
314 }
315
git_hashsig_free(git_hashsig * sig)316 void git_hashsig_free(git_hashsig *sig)
317 {
318 git__free(sig);
319 }
320
hashsig_heap_compare(const hashsig_heap * a,const hashsig_heap * b)321 static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
322 {
323 int matches = 0, i, j, cmp;
324
325 GIT_ASSERT_WITH_RETVAL(a->cmp == b->cmp, 0);
326
327 /* hash heaps are sorted - just look for overlap vs total */
328
329 for (i = 0, j = 0; i < a->size && j < b->size; ) {
330 cmp = a->cmp(&a->values[i], &b->values[j], NULL);
331
332 if (cmp < 0)
333 ++i;
334 else if (cmp > 0)
335 ++j;
336 else {
337 ++i; ++j; ++matches;
338 }
339 }
340
341 return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
342 }
343
git_hashsig_compare(const git_hashsig * a,const git_hashsig * b)344 int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
345 {
346 /* if we have no elements in either file then each file is either
347 * empty or blank. if we're ignoring whitespace then the files are
348 * similar, otherwise they're dissimilar.
349 */
350 if (a->mins.size == 0 && b->mins.size == 0) {
351 if ((!a->lines && !b->lines) ||
352 (a->opt & GIT_HASHSIG_IGNORE_WHITESPACE))
353 return HASHSIG_SCALE;
354 else
355 return 0;
356 }
357
358 /* if we have fewer than the maximum number of elements, then just use
359 * one array since the two arrays will be the same
360 */
361 if (a->mins.size < HASHSIG_HEAP_SIZE) {
362 return hashsig_heap_compare(&a->mins, &b->mins);
363 } else {
364 int mins, maxs;
365
366 if ((mins = hashsig_heap_compare(&a->mins, &b->mins)) < 0)
367 return mins;
368 if ((maxs = hashsig_heap_compare(&a->maxs, &b->maxs)) < 0)
369 return maxs;
370
371 return (mins + maxs) / 2;
372 }
373 }
374