1 /*
2  * Copyright (C) the libgit2 contributors. All rights reserved.
3  *
4  * This file is part of libgit2, distributed under the GNU GPL v2 with
5  * a Linking Exception. For full terms see the included COPYING file.
6  */
7 
8 #include "common.h"
9 
10 #include "git2/sys/hashsig.h"
11 #include "futils.h"
12 #include "util.h"
13 
14 typedef uint32_t hashsig_t;
15 typedef uint64_t hashsig_state;
16 
17 #define HASHSIG_SCALE 100
18 
19 #define HASHSIG_MAX_RUN 80
20 #define HASHSIG_HASH_START	0x012345678ABCDEF0LL
21 #define HASHSIG_HASH_SHIFT  5
22 
23 #define HASHSIG_HASH_MIX(S,CH) \
24 	(S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH)
25 
26 #define HASHSIG_HEAP_SIZE ((1 << 7) - 1)
27 #define HASHSIG_HEAP_MIN_SIZE 4
28 
29 typedef int (*hashsig_cmp)(const void *a, const void *b, void *);
30 
31 typedef struct {
32 	int size, asize;
33 	hashsig_cmp cmp;
34 	hashsig_t values[HASHSIG_HEAP_SIZE];
35 } hashsig_heap;
36 
37 struct git_hashsig {
38 	hashsig_heap mins;
39 	hashsig_heap maxs;
40 	size_t lines;
41 	git_hashsig_option_t opt;
42 };
43 
44 #define HEAP_LCHILD_OF(I) (((I)<<1)+1)
45 #define HEAP_RCHILD_OF(I) (((I)<<1)+2)
46 #define HEAP_PARENT_OF(I) (((I)-1)>>1)
47 
hashsig_heap_init(hashsig_heap * h,hashsig_cmp cmp)48 static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp)
49 {
50 	h->size  = 0;
51 	h->asize = HASHSIG_HEAP_SIZE;
52 	h->cmp   = cmp;
53 }
54 
hashsig_cmp_max(const void * a,const void * b,void * payload)55 static int hashsig_cmp_max(const void *a, const void *b, void *payload)
56 {
57 	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
58 	GIT_UNUSED(payload);
59 	return (av < bv) ? -1 : (av > bv) ? 1 : 0;
60 }
61 
hashsig_cmp_min(const void * a,const void * b,void * payload)62 static int hashsig_cmp_min(const void *a, const void *b, void *payload)
63 {
64 	hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b;
65 	GIT_UNUSED(payload);
66 	return (av > bv) ? -1 : (av < bv) ? 1 : 0;
67 }
68 
hashsig_heap_up(hashsig_heap * h,int el)69 static void hashsig_heap_up(hashsig_heap *h, int el)
70 {
71 	int parent_el = HEAP_PARENT_OF(el);
72 
73 	while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) {
74 		hashsig_t t = h->values[el];
75 		h->values[el] = h->values[parent_el];
76 		h->values[parent_el] = t;
77 
78 		el = parent_el;
79 		parent_el = HEAP_PARENT_OF(el);
80 	}
81 }
82 
hashsig_heap_down(hashsig_heap * h,int el)83 static void hashsig_heap_down(hashsig_heap *h, int el)
84 {
85 	hashsig_t v, lv, rv;
86 
87 	/* 'el < h->size / 2' tests if el is bottom row of heap */
88 
89 	while (el < h->size / 2) {
90 		int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel;
91 
92 		v  = h->values[el];
93 		lv = h->values[lel];
94 		rv = h->values[rel];
95 
96 		if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0)
97 			break;
98 
99 		swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel;
100 
101 		h->values[el] = h->values[swapel];
102 		h->values[swapel] = v;
103 
104 		el = swapel;
105 	}
106 }
107 
hashsig_heap_sort(hashsig_heap * h)108 static void hashsig_heap_sort(hashsig_heap *h)
109 {
110 	/* only need to do this at the end for signature comparison */
111 	git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL);
112 }
113 
hashsig_heap_insert(hashsig_heap * h,hashsig_t val)114 static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val)
115 {
116 	/* if heap is not full, insert new element */
117 	if (h->size < h->asize) {
118 		h->values[h->size++] = val;
119 		hashsig_heap_up(h, h->size - 1);
120 	}
121 
122 	/* if heap is full, pop top if new element should replace it */
123 	else if (h->cmp(&val, &h->values[0], NULL) > 0) {
124 		h->size--;
125 		h->values[0] = h->values[h->size];
126 		hashsig_heap_down(h, 0);
127 	}
128 
129 }
130 
131 typedef struct {
132 	int use_ignores;
133 	uint8_t ignore_ch[256];
134 } hashsig_in_progress;
135 
hashsig_in_progress_init(hashsig_in_progress * prog,git_hashsig * sig)136 static void hashsig_in_progress_init(
137 	hashsig_in_progress *prog, git_hashsig *sig)
138 {
139 	int i;
140 
141 	/* no more than one can be set */
142 	assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) ||
143 		   !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE));
144 
145 	if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) {
146 		for (i = 0; i < 256; ++i)
147 			prog->ignore_ch[i] = git__isspace_nonlf(i);
148 		prog->use_ignores = 1;
149 	} else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) {
150 		for (i = 0; i < 256; ++i)
151 			prog->ignore_ch[i] = git__isspace(i);
152 		prog->use_ignores = 1;
153 	} else {
154 		memset(prog, 0, sizeof(*prog));
155 	}
156 }
157 
hashsig_add_hashes(git_hashsig * sig,const uint8_t * data,size_t size,hashsig_in_progress * prog)158 static int hashsig_add_hashes(
159 	git_hashsig *sig,
160 	const uint8_t *data,
161 	size_t size,
162 	hashsig_in_progress *prog)
163 {
164 	const uint8_t *scan = data, *end = data + size;
165 	hashsig_state state = HASHSIG_HASH_START;
166 	int use_ignores = prog->use_ignores, len;
167 	uint8_t ch;
168 
169 	while (scan < end) {
170 		state = HASHSIG_HASH_START;
171 
172 		for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) {
173 			ch = *scan;
174 
175 			if (use_ignores)
176 				for (; scan < end && git__isspace_nonlf(ch); ch = *scan)
177 					++scan;
178 			else if (sig->opt &
179 					 (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE))
180 				for (; scan < end && ch == '\r'; ch = *scan)
181 					++scan;
182 
183 			/* peek at next character to decide what to do next */
184 			if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE)
185 				use_ignores = (ch == '\n');
186 
187 			if (scan >= end)
188 				break;
189 			++scan;
190 
191 			/* check run terminator */
192 			if (ch == '\n' || ch == '\0') {
193 				sig->lines++;
194 				break;
195 			}
196 
197 			++len;
198 			HASHSIG_HASH_MIX(state, ch);
199 		}
200 
201 		if (len > 0) {
202 			hashsig_heap_insert(&sig->mins, (hashsig_t)state);
203 			hashsig_heap_insert(&sig->maxs, (hashsig_t)state);
204 
205 			while (scan < end && (*scan == '\n' || !*scan))
206 				++scan;
207 		}
208 	}
209 
210 	prog->use_ignores = use_ignores;
211 
212 	return 0;
213 }
214 
hashsig_finalize_hashes(git_hashsig * sig)215 static int hashsig_finalize_hashes(git_hashsig *sig)
216 {
217 	if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE &&
218 		!(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) {
219 		git_error_set(GIT_ERROR_INVALID,
220 			"file too small for similarity signature calculation");
221 		return GIT_EBUFS;
222 	}
223 
224 	hashsig_heap_sort(&sig->mins);
225 	hashsig_heap_sort(&sig->maxs);
226 
227 	return 0;
228 }
229 
hashsig_alloc(git_hashsig_option_t opts)230 static git_hashsig *hashsig_alloc(git_hashsig_option_t opts)
231 {
232 	git_hashsig *sig = git__calloc(1, sizeof(git_hashsig));
233 	if (!sig)
234 		return NULL;
235 
236 	hashsig_heap_init(&sig->mins, hashsig_cmp_min);
237 	hashsig_heap_init(&sig->maxs, hashsig_cmp_max);
238 	sig->opt = opts;
239 
240 	return sig;
241 }
242 
git_hashsig_create(git_hashsig ** out,const char * buf,size_t buflen,git_hashsig_option_t opts)243 int git_hashsig_create(
244 	git_hashsig **out,
245 	const char *buf,
246 	size_t buflen,
247 	git_hashsig_option_t opts)
248 {
249 	int error;
250 	hashsig_in_progress prog;
251 	git_hashsig *sig = hashsig_alloc(opts);
252 	GIT_ERROR_CHECK_ALLOC(sig);
253 
254 	hashsig_in_progress_init(&prog, sig);
255 
256 	error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog);
257 
258 	if (!error)
259 		error = hashsig_finalize_hashes(sig);
260 
261 	if (!error)
262 		*out = sig;
263 	else
264 		git_hashsig_free(sig);
265 
266 	return error;
267 }
268 
git_hashsig_create_fromfile(git_hashsig ** out,const char * path,git_hashsig_option_t opts)269 int git_hashsig_create_fromfile(
270 	git_hashsig **out,
271 	const char *path,
272 	git_hashsig_option_t opts)
273 {
274 	uint8_t buf[0x1000];
275 	ssize_t buflen = 0;
276 	int error = 0, fd;
277 	hashsig_in_progress prog;
278 	git_hashsig *sig = hashsig_alloc(opts);
279 	GIT_ERROR_CHECK_ALLOC(sig);
280 
281 	if ((fd = git_futils_open_ro(path)) < 0) {
282 		git__free(sig);
283 		return fd;
284 	}
285 
286 	hashsig_in_progress_init(&prog, sig);
287 
288 	while (!error) {
289 		if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) {
290 			if ((error = (int)buflen) < 0)
291 				git_error_set(GIT_ERROR_OS,
292 					"read error on '%s' calculating similarity hashes", path);
293 			break;
294 		}
295 
296 		error = hashsig_add_hashes(sig, buf, buflen, &prog);
297 	}
298 
299 	p_close(fd);
300 
301 	if (!error)
302 		error = hashsig_finalize_hashes(sig);
303 
304 	if (!error)
305 		*out = sig;
306 	else
307 		git_hashsig_free(sig);
308 
309 	return error;
310 }
311 
git_hashsig_free(git_hashsig * sig)312 void git_hashsig_free(git_hashsig *sig)
313 {
314 	git__free(sig);
315 }
316 
hashsig_heap_compare(const hashsig_heap * a,const hashsig_heap * b)317 static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b)
318 {
319 	int matches = 0, i, j, cmp;
320 
321 	assert(a->cmp == b->cmp);
322 
323 	/* hash heaps are sorted - just look for overlap vs total */
324 
325 	for (i = 0, j = 0; i < a->size && j < b->size; ) {
326 		cmp = a->cmp(&a->values[i], &b->values[j], NULL);
327 
328 		if (cmp < 0)
329 			++i;
330 		else if (cmp > 0)
331 			++j;
332 		else {
333 			++i; ++j; ++matches;
334 		}
335 	}
336 
337 	return HASHSIG_SCALE * (matches * 2) / (a->size + b->size);
338 }
339 
git_hashsig_compare(const git_hashsig * a,const git_hashsig * b)340 int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b)
341 {
342 	/* if we have no elements in either file then each file is either
343 	 * empty or blank.  if we're ignoring whitespace then the files are
344 	 * similar, otherwise they're dissimilar.
345 	 */
346 	if (a->mins.size == 0 && b->mins.size == 0) {
347 		if ((!a->lines && !b->lines) ||
348 			(a->opt & GIT_HASHSIG_IGNORE_WHITESPACE))
349 			return HASHSIG_SCALE;
350 		else
351 			return 0;
352 	}
353 
354 	/* if we have fewer than the maximum number of elements, then just use
355 	 * one array since the two arrays will be the same
356 	 */
357 	if (a->mins.size < HASHSIG_HEAP_SIZE)
358 		return hashsig_heap_compare(&a->mins, &b->mins);
359 	else
360 		return (hashsig_heap_compare(&a->mins, &b->mins) +
361 				hashsig_heap_compare(&a->maxs, &b->maxs)) / 2;
362 }
363