1 /*
2  * Copyright (C) the libgit2 contributors. All rights reserved.
3  *
4  * This file is part of libgit2, distributed under the GNU GPL v2 with
5  * a Linking Exception. For full terms see the included COPYING file.
6  */
7 
8 #include "common.h"
9 
10 #include "git2/sys/hashsig.h"
11 #include "futils.h"
12 #include "util.h"
13 
14 typedef uint32_t hashsig_t;
15 typedef uint64_t hashsig_state;
16 
17 #define HASHSIG_SCALE 100
18 
19 #define HASHSIG_MAX_RUN 80
20 #define HASHSIG_HASH_START	0x012345678ABCDEF0LL
21 #define HASHSIG_HASH_SHIFT  5
22 
23 #define HASHSIG_HASH_MIX(S,CH) \
24 	(S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
25 
26 #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
27 #define HASHSIG_HEAP_MIN_SIZE 4
28 
29 typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
30 
31 typedef struct {
32 	int size, asize;
33 	hashsig_cmp cmp;
34 	hashsig_t values[HASHSIG_HEAP_SIZE];
35 } hashsig_heap;
36 
37 struct git_hashsig {
38 	hashsig_heap mins;
39 	hashsig_heap maxs;
40 	size_t lines;
41 	git_hashsig_option_t opt;
42 };
43 
44 #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
45 #define HEAP_RCHILD_OF(I) (((I)<<1)+2)
46 #define HEAP_PARENT_OF(I) (((I)-1)>>1)
47 
hashsig_heap_init(hashsig_heap * h,hashsig_cmp cmp)48 static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
49 {
50 	h->size  = 0;
51 	h->asize = HASHSIG_HEAP_SIZE;
52 	h->cmp   = cmp;
53 }
54 
hashsig_cmp_max(const void * a,const void * b,void * payload)55 static int hashsig_cmp_max(const void *a, const void *b, void *payload)
56 {
57 	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
58 	GIT_UNUSED(payload);
59 	return (av < bv) ? -1 : (av > bv) ? 1 : 0;
60 }
61 
hashsig_cmp_min(const void * a,const void * b,void * payload)62 static int hashsig_cmp_min(const void *a, const void *b, void *payload)
63 {
64 	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
65 	GIT_UNUSED(payload);
66 	return (av > bv) ? -1 : (av < bv) ? 1 : 0;
67 }
68 
hashsig_heap_up(hashsig_heap * h,int el)69 static void hashsig_heap_up(hashsig_heap *h, int el)
70 {
71 	int parent_el = HEAP_PARENT_OF(el);
72 
73 	while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
74 		hashsig_t t = h->values[el];
75 		h->values[el] = h->values[parent_el];
76 		h->values[parent_el] = t;
77 
78 		el = parent_el;
79 		parent_el = HEAP_PARENT_OF(el);
80 	}
81 }
82 
hashsig_heap_down(hashsig_heap * h,int el)83 static void hashsig_heap_down(hashsig_heap *h, int el)
84 {
85 	hashsig_t v, lv, rv;
86 
87 	/* 'el < h->size / 2' tests if el is bottom row of heap */
88 
89 	while (el < h->size / 2) {
90 		int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
91 
92 		v  = h->values[el];
93 		lv = h->values[lel];
94 		rv = h->values[rel];
95 
96 		if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
97 			break;
98 
99 		swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
100 
101 		h->values[el] = h->values[swapel];
102 		h->values[swapel] = v;
103 
104 		el = swapel;
105 	}
106 }
107 
hashsig_heap_sort(hashsig_heap * h)108 static void hashsig_heap_sort(hashsig_heap *h)
109 {
110 	/* only need to do this at the end for signature comparison */
111 	git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
112 }
113 
hashsig_heap_insert(hashsig_heap * h,hashsig_t val)114 static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
115 {
116 	/* if heap is not full, insert new element */
117 	if (h->size < h->asize) {
118 		h->values[h->size++] = val;
119 		hashsig_heap_up(h, h->size - 1);
120 	}
121 
122 	/* if heap is full, pop top if new element should replace it */
123 	else if (h->cmp(&val, &h->values[0], NULL) > 0) {
124 		h->size--;
125 		h->values[0] = h->values[h->size];
126 		hashsig_heap_down(h, 0);
127 	}
128 
129 }
130 
131 typedef struct {
132 	int use_ignores;
133 	uint8_t ignore_ch[256];
134 } hashsig_in_progress;
135 
hashsig_in_progress_init(hashsig_in_progress * prog,git_hashsig * sig)136 static int hashsig_in_progress_init(
137 	hashsig_in_progress *prog, git_hashsig *sig)
138 {
139 	int i;
140 
141 	/* no more than one can be set */
142 	GIT_ASSERT(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
143 		   !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
144 
145 	if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
146 		for (i = 0; i < 256; ++i)
147 			prog->ignore_ch[i] = git__isspace_nonlf(i);
148 		prog->use_ignores = 1;
149 	} else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
150 		for (i = 0; i < 256; ++i)
151 			prog->ignore_ch[i] = git__isspace(i);
152 		prog->use_ignores = 1;
153 	} else {
154 		memset(prog, 0, sizeof(*prog));
155 	}
156 
157 	return 0;
158 }
159 
hashsig_add_hashes(git_hashsig * sig,const uint8_t * data,size_t size,hashsig_in_progress * prog)160 static int hashsig_add_hashes(
161 	git_hashsig *sig,
162 	const uint8_t *data,
163 	size_t size,
164 	hashsig_in_progress *prog)
165 {
166 	const uint8_t *scan = data, *end = data + size;
167 	hashsig_state state = HASHSIG_HASH_START;
168 	int use_ignores = prog->use_ignores, len;
169 	uint8_t ch;
170 
171 	while (scan < end) {
172 		state = HASHSIG_HASH_START;
173 
174 		for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
175 			ch = *scan;
176 
177 			if (use_ignores)
178 				for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
179 					++scan;
180 			else if (sig->opt &
181 					 (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
182 				for (; scan < end && ch == '\r'; ch = *scan)
183 					++scan;
184 
185 			/* peek at next character to decide what to do next */
186 			if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
187 				use_ignores = (ch == '\n');
188 
189 			if (scan >= end)
190 				break;
191 			++scan;
192 
193 			/* check run terminator */
194 			if (ch == '\n' || ch == '\0') {
195 				sig->lines++;
196 				break;
197 			}
198 
199 			++len;
200 			HASHSIG_HASH_MIX(state, ch);
201 		}
202 
203 		if (len > 0) {
204 			hashsig_heap_insert(&sig->mins, (hashsig_t)state);
205 			hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
206 
207 			while (scan < end && (*scan == '\n' || !*scan))
208 				++scan;
209 		}
210 	}
211 
212 	prog->use_ignores = use_ignores;
213 
214 	return 0;
215 }
216 
hashsig_finalize_hashes(git_hashsig * sig)217 static int hashsig_finalize_hashes(git_hashsig *sig)
218 {
219 	if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
220 		!(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
221 		git_error_set(GIT_ERROR_INVALID,
222 			"file too small for similarity signature calculation");
223 		return GIT_EBUFS;
224 	}
225 
226 	hashsig_heap_sort(&sig->mins);
227 	hashsig_heap_sort(&sig->maxs);
228 
229 	return 0;
230 }
231 
hashsig_alloc(git_hashsig_option_t opts)232 static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
233 {
234 	git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
235 	if (!sig)
236 		return NULL;
237 
238 	hashsig_heap_init(&sig->mins, hashsig_cmp_min);
239 	hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
240 	sig->opt = opts;
241 
242 	return sig;
243 }
244 
git_hashsig_create(git_hashsig ** out,const char * buf,size_t buflen,git_hashsig_option_t opts)245 int git_hashsig_create(
246 	git_hashsig **out,
247 	const char *buf,
248 	size_t buflen,
249 	git_hashsig_option_t opts)
250 {
251 	int error;
252 	hashsig_in_progress prog;
253 	git_hashsig *sig = hashsig_alloc(opts);
254 	GIT_ERROR_CHECK_ALLOC(sig);
255 
256 	if ((error = hashsig_in_progress_init(&prog, sig)) < 0)
257 		return error;
258 
259 	error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog);
260 
261 	if (!error)
262 		error = hashsig_finalize_hashes(sig);
263 
264 	if (!error)
265 		*out = sig;
266 	else
267 		git_hashsig_free(sig);
268 
269 	return error;
270 }
271 
git_hashsig_create_fromfile(git_hashsig ** out,const char * path,git_hashsig_option_t opts)272 int git_hashsig_create_fromfile(
273 	git_hashsig **out,
274 	const char *path,
275 	git_hashsig_option_t opts)
276 {
277 	uint8_t buf[0x1000];
278 	ssize_t buflen = 0;
279 	int error = 0, fd;
280 	hashsig_in_progress prog;
281 	git_hashsig *sig = hashsig_alloc(opts);
282 	GIT_ERROR_CHECK_ALLOC(sig);
283 
284 	if ((fd = git_futils_open_ro(path)) < 0) {
285 		git__free(sig);
286 		return fd;
287 	}
288 
289 	if ((error = hashsig_in_progress_init(&prog, sig)) < 0)
290 		return error;
291 
292 	while (!error) {
293 		if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
294 			if ((error = (int)buflen) < 0)
295 				git_error_set(GIT_ERROR_OS,
296 					"read error on '%s' calculating similarity hashes", path);
297 			break;
298 		}
299 
300 		error = hashsig_add_hashes(sig, buf, buflen, &prog);
301 	}
302 
303 	p_close(fd);
304 
305 	if (!error)
306 		error = hashsig_finalize_hashes(sig);
307 
308 	if (!error)
309 		*out = sig;
310 	else
311 		git_hashsig_free(sig);
312 
313 	return error;
314 }
315 
git_hashsig_free(git_hashsig * sig)316 void git_hashsig_free(git_hashsig *sig)
317 {
318 	git__free(sig);
319 }
320 
hashsig_heap_compare(const hashsig_heap * a,const hashsig_heap * b)321 static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
322 {
323 	int matches = 0, i, j, cmp;
324 
325 	GIT_ASSERT_WITH_RETVAL(a->cmp == b->cmp, 0);
326 
327 	/* hash heaps are sorted - just look for overlap vs total */
328 
329 	for (i = 0, j = 0; i < a->size && j < b->size; ) {
330 		cmp = a->cmp(&a->values[i], &b->values[j], NULL);
331 
332 		if (cmp < 0)
333 			++i;
334 		else if (cmp > 0)
335 			++j;
336 		else {
337 			++i; ++j; ++matches;
338 		}
339 	}
340 
341 	return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
342 }
343 
git_hashsig_compare(const git_hashsig * a,const git_hashsig * b)344 int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
345 {
346 	/* if we have no elements in either file then each file is either
347 	 * empty or blank.  if we're ignoring whitespace then the files are
348 	 * similar, otherwise they're dissimilar.
349 	 */
350 	if (a->mins.size == 0 && b->mins.size == 0) {
351 		if ((!a->lines && !b->lines) ||
352 			(a->opt & GIT_HASHSIG_IGNORE_WHITESPACE))
353 			return HASHSIG_SCALE;
354 		else
355 			return 0;
356 	}
357 
358 	/* if we have fewer than the maximum number of elements, then just use
359 	 * one array since the two arrays will be the same
360 	 */
361 	if (a->mins.size < HASHSIG_HEAP_SIZE) {
362 		return hashsig_heap_compare(&a->mins, &b->mins);
363 	} else {
364 		int mins, maxs;
365 
366 		if ((mins = hashsig_heap_compare(&a->mins, &b->mins)) < 0)
367 			return mins;
368 		if ((maxs = hashsig_heap_compare(&a->maxs, &b->maxs)) < 0)
369 			return maxs;
370 
371 		return (mins + maxs) / 2;
372 	}
373 }
374