1 /*
2  * Copyright (C) the libgit2 contributors. All rights reserved.
3  *
4  * This file is part of libgit2, distributed under the GNU GPL v2 with
5  * a Linking Exception. For full terms see the included COPYING file.
6  */
7 
8 #include "diff_tform.h"
9 
10 #include "git2/config.h"
11 #include "git2/blob.h"
12 #include "git2/sys/hashsig.h"
13 
14 #include "diff.h"
15 #include "diff_generate.h"
16 #include "path.h"
17 #include "futils.h"
18 #include "config.h"
19 
git_diff__delta_dup(const git_diff_delta * d,git_pool * pool)20 git_diff_delta *git_diff__delta_dup(
21 	const git_diff_delta *d, git_pool *pool)
22 {
23 	git_diff_delta *delta = git__malloc(sizeof(git_diff_delta));
24 	if (!delta)
25 		return NULL;
26 
27 	memcpy(delta, d, sizeof(git_diff_delta));
28 	GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
29 
30 	if (d->old_file.path != NULL) {
31 		delta->old_file.path = git_pool_strdup(pool, d->old_file.path);
32 		if (delta->old_file.path == NULL)
33 			goto fail;
34 	}
35 
36 	if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) {
37 		delta->new_file.path = git_pool_strdup(pool, d->new_file.path);
38 		if (delta->new_file.path == NULL)
39 			goto fail;
40 	} else {
41 		delta->new_file.path = delta->old_file.path;
42 	}
43 
44 	return delta;
45 
46 fail:
47 	git__free(delta);
48 	return NULL;
49 }
50 
git_diff__merge_like_cgit(const git_diff_delta * a,const git_diff_delta * b,git_pool * pool)51 git_diff_delta *git_diff__merge_like_cgit(
52 	const git_diff_delta *a,
53 	const git_diff_delta *b,
54 	git_pool *pool)
55 {
56 	git_diff_delta *dup;
57 
58 	/* Emulate C git for merging two diffs (a la 'git diff <sha>').
59 	 *
60 	 * When C git does a diff between the work dir and a tree, it actually
61 	 * diffs with the index but uses the workdir contents.  This emulates
62 	 * those choices so we can emulate the type of diff.
63 	 *
64 	 * We have three file descriptions here, let's call them:
65 	 *  f1 = a->old_file
66 	 *  f2 = a->new_file AND b->old_file
67 	 *  f3 = b->new_file
68 	 */
69 
70 	/* If one of the diffs is a conflict, just dup it */
71 	if (b->status == GIT_DELTA_CONFLICTED)
72 		return git_diff__delta_dup(b, pool);
73 	if (a->status == GIT_DELTA_CONFLICTED)
74 		return git_diff__delta_dup(a, pool);
75 
76 	/* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */
77 	if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED)
78 		return git_diff__delta_dup(a, pool);
79 
80 	/* otherwise, base this diff on the 'b' diff */
81 	if ((dup = git_diff__delta_dup(b, pool)) == NULL)
82 		return NULL;
83 
84 	/* If 'a' status is uninteresting, then we're done */
85 	if (a->status == GIT_DELTA_UNMODIFIED ||
86 		a->status == GIT_DELTA_UNTRACKED ||
87 		a->status == GIT_DELTA_UNREADABLE)
88 		return dup;
89 
90 	GIT_ASSERT_WITH_RETVAL(b->status != GIT_DELTA_UNMODIFIED, NULL);
91 
92 	/* A cgit exception is that the diff of a file that is only in the
93 	 * index (i.e. not in HEAD nor workdir) is given as empty.
94 	 */
95 	if (dup->status == GIT_DELTA_DELETED) {
96 		if (a->status == GIT_DELTA_ADDED) {
97 			dup->status = GIT_DELTA_UNMODIFIED;
98 			dup->nfiles = 2;
99 		}
100 		/* else don't overwrite DELETE status */
101 	} else {
102 		dup->status = a->status;
103 		dup->nfiles = a->nfiles;
104 	}
105 
106 	git_oid_cpy(&dup->old_file.id, &a->old_file.id);
107 	dup->old_file.mode  = a->old_file.mode;
108 	dup->old_file.size  = a->old_file.size;
109 	dup->old_file.flags = a->old_file.flags;
110 
111 	return dup;
112 }
113 
git_diff__merge(git_diff * onto,const git_diff * from,git_diff__merge_cb cb)114 int git_diff__merge(
115 	git_diff *onto, const git_diff *from, git_diff__merge_cb cb)
116 {
117 	int error = 0;
118 	git_pool onto_pool;
119 	git_vector onto_new;
120 	git_diff_delta *delta;
121 	bool ignore_case, reversed;
122 	unsigned int i, j;
123 
124 	GIT_ASSERT_ARG(onto);
125 	GIT_ASSERT_ARG(from);
126 
127 	if (!from->deltas.length)
128 		return 0;
129 
130 	ignore_case = ((onto->opts.flags & GIT_DIFF_IGNORE_CASE) != 0);
131 	reversed    = ((onto->opts.flags & GIT_DIFF_REVERSE) != 0);
132 
133 	if (ignore_case != ((from->opts.flags & GIT_DIFF_IGNORE_CASE) != 0) ||
134 		reversed    != ((from->opts.flags & GIT_DIFF_REVERSE) != 0)) {
135 		git_error_set(GIT_ERROR_INVALID,
136 			"attempt to merge diffs created with conflicting options");
137 		return -1;
138 	}
139 
140 	if (git_vector_init(&onto_new, onto->deltas.length, git_diff_delta__cmp) < 0 ||
141 	    git_pool_init(&onto_pool, 1) < 0)
142 		return -1;
143 
144 	for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
145 		git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
146 		const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
147 		int cmp = !f ? -1 : !o ? 1 :
148 			STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);
149 
150 		if (cmp < 0) {
151 			delta = git_diff__delta_dup(o, &onto_pool);
152 			i++;
153 		} else if (cmp > 0) {
154 			delta = git_diff__delta_dup(f, &onto_pool);
155 			j++;
156 		} else {
157 			const git_diff_delta *left = reversed ? f : o;
158 			const git_diff_delta *right = reversed ? o : f;
159 
160 			delta = cb(left, right, &onto_pool);
161 			i++;
162 			j++;
163 		}
164 
165 		/* the ignore rules for the target may not match the source
166 		 * or the result of a merged delta could be skippable...
167 		 */
168 		if (delta && git_diff_delta__should_skip(&onto->opts, delta)) {
169 			git__free(delta);
170 			continue;
171 		}
172 
173 		if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0)
174 			break;
175 	}
176 
177 	if (!error) {
178 		git_vector_swap(&onto->deltas, &onto_new);
179 		git_pool_swap(&onto->pool, &onto_pool);
180 
181 		if ((onto->opts.flags & GIT_DIFF_REVERSE) != 0)
182 			onto->old_src = from->old_src;
183 		else
184 			onto->new_src = from->new_src;
185 
186 		/* prefix strings also come from old pool, so recreate those.*/
187 		onto->opts.old_prefix =
188 			git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
189 		onto->opts.new_prefix =
190 			git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
191 	}
192 
193 	git_vector_free_deep(&onto_new);
194 	git_pool_clear(&onto_pool);
195 
196 	return error;
197 }
198 
git_diff_merge(git_diff * onto,const git_diff * from)199 int git_diff_merge(git_diff *onto, const git_diff *from)
200 {
201 	return git_diff__merge(onto, from, git_diff__merge_like_cgit);
202 }
203 
git_diff_find_similar__hashsig_for_file(void ** out,const git_diff_file * f,const char * path,void * p)204 int git_diff_find_similar__hashsig_for_file(
205 	void **out, const git_diff_file *f, const char *path, void *p)
206 {
207 	git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
208 
209 	GIT_UNUSED(f);
210 	return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
211 }
212 
git_diff_find_similar__hashsig_for_buf(void ** out,const git_diff_file * f,const char * buf,size_t len,void * p)213 int git_diff_find_similar__hashsig_for_buf(
214 	void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
215 {
216 	git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
217 
218 	GIT_UNUSED(f);
219 	return git_hashsig_create((git_hashsig **)out, buf, len, opt);
220 }
221 
git_diff_find_similar__hashsig_free(void * sig,void * payload)222 void git_diff_find_similar__hashsig_free(void *sig, void *payload)
223 {
224 	GIT_UNUSED(payload);
225 	git_hashsig_free(sig);
226 }
227 
git_diff_find_similar__calc_similarity(int * score,void * siga,void * sigb,void * payload)228 int git_diff_find_similar__calc_similarity(
229 	int *score, void *siga, void *sigb, void *payload)
230 {
231 	int error;
232 
233 	GIT_UNUSED(payload);
234 	error = git_hashsig_compare(siga, sigb);
235 	if (error < 0)
236 		return error;
237 
238 	*score = error;
239 	return 0;
240 }
241 
242 #define DEFAULT_THRESHOLD 50
243 #define DEFAULT_BREAK_REWRITE_THRESHOLD 60
244 #define DEFAULT_RENAME_LIMIT 200
245 
normalize_find_opts(git_diff * diff,git_diff_find_options * opts,const git_diff_find_options * given)246 static int normalize_find_opts(
247 	git_diff *diff,
248 	git_diff_find_options *opts,
249 	const git_diff_find_options *given)
250 {
251 	git_config *cfg = NULL;
252 	git_hashsig_option_t hashsig_opts;
253 
254 	GIT_ERROR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
255 
256 	if (diff->repo != NULL &&
257 		git_repository_config__weakptr(&cfg, diff->repo) < 0)
258 		return -1;
259 
260 	if (given)
261 		memcpy(opts, given, sizeof(*opts));
262 
263 	if (!given ||
264 		 (given->flags & GIT_DIFF_FIND_ALL) == GIT_DIFF_FIND_BY_CONFIG)
265 	{
266 		if (cfg) {
267 			char *rule =
268 				git_config__get_string_force(cfg, "diff.renames", "true");
269 			int boolval;
270 
271 			if (!git__parse_bool(&boolval, rule) && !boolval)
272 				/* don't set FIND_RENAMES if bool value is false */;
273 			else if (!strcasecmp(rule, "copies") || !strcasecmp(rule, "copy"))
274 				opts->flags |= GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
275 			else
276 				opts->flags |= GIT_DIFF_FIND_RENAMES;
277 
278 			git__free(rule);
279 		} else {
280 			/* set default flag */
281 			opts->flags |= GIT_DIFF_FIND_RENAMES;
282 		}
283 	}
284 
285 	/* some flags imply others */
286 
287 	if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
288 		/* if we are only looking for exact matches, then don't turn
289 		 * MODIFIED items into ADD/DELETE pairs because it's too picky
290 		 */
291 		opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
292 
293 		/* similarly, don't look for self-rewrites to split */
294 		opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
295 	}
296 
297 	if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
298 		opts->flags |= GIT_DIFF_FIND_RENAMES;
299 
300 	if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
301 		opts->flags |= GIT_DIFF_FIND_COPIES;
302 
303 	if (opts->flags & GIT_DIFF_BREAK_REWRITES)
304 		opts->flags |= GIT_DIFF_FIND_REWRITES;
305 
306 #define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
307 
308 	if (USE_DEFAULT(opts->rename_threshold))
309 		opts->rename_threshold = DEFAULT_THRESHOLD;
310 
311 	if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
312 		opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;
313 
314 	if (USE_DEFAULT(opts->copy_threshold))
315 		opts->copy_threshold = DEFAULT_THRESHOLD;
316 
317 	if (USE_DEFAULT(opts->break_rewrite_threshold))
318 		opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;
319 
320 #undef USE_DEFAULT
321 
322 	if (!opts->rename_limit) {
323 		if (cfg) {
324 			opts->rename_limit = git_config__get_int_force(
325 				cfg, "diff.renamelimit", DEFAULT_RENAME_LIMIT);
326 		}
327 
328 		if (opts->rename_limit <= 0)
329 			opts->rename_limit = DEFAULT_RENAME_LIMIT;
330 	}
331 
332 	/* assign the internal metric with whitespace flag as payload */
333 	if (!opts->metric) {
334 		opts->metric = git__malloc(sizeof(git_diff_similarity_metric));
335 		GIT_ERROR_CHECK_ALLOC(opts->metric);
336 
337 		opts->metric->file_signature = git_diff_find_similar__hashsig_for_file;
338 		opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf;
339 		opts->metric->free_signature = git_diff_find_similar__hashsig_free;
340 		opts->metric->similarity = git_diff_find_similar__calc_similarity;
341 
342 		if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
343 			hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
344 		else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
345 			hashsig_opts = GIT_HASHSIG_NORMAL;
346 		else
347 			hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
348 		hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
349 		opts->metric->payload = (void *)hashsig_opts;
350 	}
351 
352 	return 0;
353 }
354 
insert_delete_side_of_split(git_diff * diff,git_vector * onto,const git_diff_delta * delta)355 static int insert_delete_side_of_split(
356 	git_diff *diff, git_vector *onto, const git_diff_delta *delta)
357 {
358 	/* make new record for DELETED side of split */
359 	git_diff_delta *deleted = git_diff__delta_dup(delta, &diff->pool);
360 	GIT_ERROR_CHECK_ALLOC(deleted);
361 
362 	deleted->status = GIT_DELTA_DELETED;
363 	deleted->nfiles = 1;
364 	memset(&deleted->new_file, 0, sizeof(deleted->new_file));
365 	deleted->new_file.path = deleted->old_file.path;
366 	deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
367 
368 	return git_vector_insert(onto, deleted);
369 }
370 
apply_splits_and_deletes(git_diff * diff,size_t expected_size,bool actually_split)371 static int apply_splits_and_deletes(
372 	git_diff *diff, size_t expected_size, bool actually_split)
373 {
374 	git_vector onto = GIT_VECTOR_INIT;
375 	size_t i;
376 	git_diff_delta *delta;
377 
378 	if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
379 		return -1;
380 
381 	/* build new delta list without TO_DELETE and splitting TO_SPLIT */
382 	git_vector_foreach(&diff->deltas, i, delta) {
383 		if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
384 			continue;
385 
386 		if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) {
387 			delta->similarity = 0;
388 
389 			if (insert_delete_side_of_split(diff, &onto, delta) < 0)
390 				goto on_error;
391 
392 			if (diff->new_src == GIT_ITERATOR_WORKDIR)
393 				delta->status = GIT_DELTA_UNTRACKED;
394 			else
395 				delta->status = GIT_DELTA_ADDED;
396 			delta->nfiles = 1;
397 			memset(&delta->old_file, 0, sizeof(delta->old_file));
398 			delta->old_file.path = delta->new_file.path;
399 			delta->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
400 		}
401 
402 		/* clean up delta before inserting into new list */
403 		GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);
404 
405 		if (delta->status != GIT_DELTA_COPIED &&
406 			delta->status != GIT_DELTA_RENAMED &&
407 			(delta->status != GIT_DELTA_MODIFIED || actually_split))
408 			delta->similarity = 0;
409 
410 		/* insert into new list */
411 		if (git_vector_insert(&onto, delta) < 0)
412 			goto on_error;
413 	}
414 
415 	/* cannot return an error past this point */
416 
417 	/* free deltas from old list that didn't make it to the new one */
418 	git_vector_foreach(&diff->deltas, i, delta) {
419 		if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
420 			git__free(delta);
421 	}
422 
423 	/* swap new delta list into place */
424 	git_vector_swap(&diff->deltas, &onto);
425 	git_vector_free(&onto);
426 	git_vector_sort(&diff->deltas);
427 
428 	return 0;
429 
430 on_error:
431 	git_vector_free_deep(&onto);
432 
433 	return -1;
434 }
435 
similarity_get_file(git_diff * diff,size_t idx)436 GIT_INLINE(git_diff_file *) similarity_get_file(git_diff *diff, size_t idx)
437 {
438 	git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
439 	return (idx & 1) ? &delta->new_file : &delta->old_file;
440 }
441 
442 typedef struct {
443 	size_t idx;
444 	git_iterator_t src;
445 	git_repository *repo;
446 	git_diff_file *file;
447 	git_buf data;
448 	git_odb_object *odb_obj;
449 	git_blob *blob;
450 } similarity_info;
451 
similarity_init(similarity_info * info,git_diff * diff,size_t file_idx)452 static int similarity_init(
453 	similarity_info *info, git_diff *diff, size_t file_idx)
454 {
455 	info->idx  = file_idx;
456 	info->src  = (file_idx & 1) ? diff->new_src : diff->old_src;
457 	info->repo = diff->repo;
458 	info->file = similarity_get_file(diff, file_idx);
459 	info->odb_obj = NULL;
460 	info->blob = NULL;
461 	git_buf_init(&info->data, 0);
462 
463 	if (info->file->size > 0 || info->src == GIT_ITERATOR_WORKDIR)
464 		return 0;
465 
466 	return git_diff_file__resolve_zero_size(
467 		info->file, &info->odb_obj, info->repo);
468 }
469 
similarity_sig(similarity_info * info,const git_diff_find_options * opts,void ** cache)470 static int similarity_sig(
471 	similarity_info *info,
472 	const git_diff_find_options *opts,
473 	void **cache)
474 {
475 	int error = 0;
476 	git_diff_file *file = info->file;
477 
478 	if (info->src == GIT_ITERATOR_WORKDIR) {
479 		if ((error = git_repository_workdir_path(
480 			&info->data, info->repo, file->path)) < 0)
481 			return error;
482 
483 		/* if path is not a regular file, just skip this item */
484 		if (!git_path_isfile(info->data.ptr))
485 			return 0;
486 
487 		/* TODO: apply wd-to-odb filters to file data if necessary */
488 
489 		error = opts->metric->file_signature(
490 			&cache[info->idx], info->file,
491 			info->data.ptr, opts->metric->payload);
492 	} else {
493 		/* if we didn't initially know the size, we might have an odb_obj
494 		 * around from earlier, so convert that, otherwise load the blob now
495 		 */
496 		if (info->odb_obj != NULL)
497 			error = git_object__from_odb_object(
498 				(git_object **)&info->blob, info->repo,
499 				info->odb_obj, GIT_OBJECT_BLOB);
500 		else
501 			error = git_blob_lookup(&info->blob, info->repo, &file->id);
502 
503 		if (error < 0) {
504 			/* if lookup fails, just skip this item in similarity calc */
505 			git_error_clear();
506 		} else {
507 			size_t sz;
508 
509 			/* index size may not be actual blob size if filtered */
510 			if (file->size != git_blob_rawsize(info->blob))
511 				file->size = git_blob_rawsize(info->blob);
512 
513 			sz = git__is_sizet(file->size) ? (size_t)file->size : (size_t)-1;
514 
515 			error = opts->metric->buffer_signature(
516 				&cache[info->idx], info->file,
517 				git_blob_rawcontent(info->blob), sz, opts->metric->payload);
518 		}
519 	}
520 
521 	return error;
522 }
523 
similarity_unload(similarity_info * info)524 static void similarity_unload(similarity_info *info)
525 {
526 	if (info->odb_obj)
527 		git_odb_object_free(info->odb_obj);
528 
529 	if (info->blob)
530 		git_blob_free(info->blob);
531 	else
532 		git_buf_dispose(&info->data);
533 }
534 
535 #define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0)
536 
537 /* - score < 0 means files cannot be compared
538  * - score >= 100 means files are exact match
539  * - score == 0 means files are completely different
540  */
similarity_measure(int * score,git_diff * diff,const git_diff_find_options * opts,void ** cache,size_t a_idx,size_t b_idx)541 static int similarity_measure(
542 	int *score,
543 	git_diff *diff,
544 	const git_diff_find_options *opts,
545 	void **cache,
546 	size_t a_idx,
547 	size_t b_idx)
548 {
549 	git_diff_file *a_file = similarity_get_file(diff, a_idx);
550 	git_diff_file *b_file = similarity_get_file(diff, b_idx);
551 	bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
552 	int error = 0;
553 	similarity_info a_info, b_info;
554 
555 	*score = -1;
556 
557 	/* don't try to compare things that aren't files */
558 	if (!GIT_MODE_ISBLOB(a_file->mode) || !GIT_MODE_ISBLOB(b_file->mode))
559 		return 0;
560 
561 	/* if exact match is requested, force calculation of missing OIDs now */
562 	if (exact_match) {
563 		if (git_oid_is_zero(&a_file->id) &&
564 			diff->old_src == GIT_ITERATOR_WORKDIR &&
565 			!git_diff__oid_for_file(&a_file->id,
566 				diff, a_file->path, a_file->mode, a_file->size))
567 			a_file->flags |= GIT_DIFF_FLAG_VALID_ID;
568 
569 		if (git_oid_is_zero(&b_file->id) &&
570 			diff->new_src == GIT_ITERATOR_WORKDIR &&
571 			!git_diff__oid_for_file(&b_file->id,
572 				diff, b_file->path, b_file->mode, b_file->size))
573 			b_file->flags |= GIT_DIFF_FLAG_VALID_ID;
574 	}
575 
576 	/* check OID match as a quick test */
577 	if (git_oid__cmp(&a_file->id, &b_file->id) == 0) {
578 		*score = 100;
579 		return 0;
580 	}
581 
582 	/* don't calculate signatures if we are doing exact match */
583 	if (exact_match) {
584 		*score = 0;
585 		return 0;
586 	}
587 
588 	memset(&a_info, 0, sizeof(a_info));
589 	memset(&b_info, 0, sizeof(b_info));
590 
591 	/* set up similarity data (will try to update missing file sizes) */
592 	if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0)
593 		return error;
594 	if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0)
595 		goto cleanup;
596 
597 	/* check if file sizes are nowhere near each other */
598 	if (a_file->size > 127 &&
599 		b_file->size > 127 &&
600 		(a_file->size > (b_file->size << 3) ||
601 		 b_file->size > (a_file->size << 3)))
602 		goto cleanup;
603 
604 	/* update signature cache if needed */
605 	if (!cache[a_idx]) {
606 		if ((error = similarity_sig(&a_info, opts, cache)) < 0)
607 			goto cleanup;
608 	}
609 	if (!cache[b_idx]) {
610 		if ((error = similarity_sig(&b_info, opts, cache)) < 0)
611 			goto cleanup;
612 	}
613 
614 	/* calculate similarity provided that the metric choose to process
615 	 * both the a and b files (some may not if file is too big, etc).
616 	 */
617 	if (cache[a_idx] && cache[b_idx])
618 		error = opts->metric->similarity(
619 			score, cache[a_idx], cache[b_idx], opts->metric->payload);
620 
621 cleanup:
622 	similarity_unload(&a_info);
623 	similarity_unload(&b_info);
624 
625 	return error;
626 }
627 
calc_self_similarity(git_diff * diff,const git_diff_find_options * opts,size_t delta_idx,void ** cache)628 static int calc_self_similarity(
629 	git_diff *diff,
630 	const git_diff_find_options *opts,
631 	size_t delta_idx,
632 	void **cache)
633 {
634 	int error, similarity = -1;
635 	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
636 
637 	if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0)
638 		return 0;
639 
640 	error = similarity_measure(
641 		&similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1);
642 	if (error < 0)
643 		return error;
644 
645 	if (similarity >= 0) {
646 		delta->similarity = (uint16_t)similarity;
647 		delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY;
648 	}
649 
650 	return 0;
651 }
652 
is_rename_target(git_diff * diff,const git_diff_find_options * opts,size_t delta_idx,void ** cache)653 static bool is_rename_target(
654 	git_diff *diff,
655 	const git_diff_find_options *opts,
656 	size_t delta_idx,
657 	void **cache)
658 {
659 	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
660 
661 	/* skip things that aren't plain blobs */
662 	if (!GIT_MODE_ISBLOB(delta->new_file.mode))
663 		return false;
664 
665 	/* only consider ADDED, RENAMED, COPIED, and split MODIFIED as
666 	 * targets; maybe include UNTRACKED if requested.
667 	 */
668 	switch (delta->status) {
669 	case GIT_DELTA_UNMODIFIED:
670 	case GIT_DELTA_DELETED:
671 	case GIT_DELTA_IGNORED:
672 	case GIT_DELTA_CONFLICTED:
673 		return false;
674 
675 	case GIT_DELTA_MODIFIED:
676 		if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
677 			!FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
678 			return false;
679 
680 		if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
681 			return false;
682 
683 		if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
684 			delta->similarity < opts->break_rewrite_threshold) {
685 			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
686 			break;
687 		}
688 		if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
689 			delta->similarity < opts->rename_from_rewrite_threshold) {
690 			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
691 			break;
692 		}
693 
694 		return false;
695 
696 	case GIT_DELTA_UNTRACKED:
697 		if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED))
698 			return false;
699 		break;
700 
701 	default: /* all other status values should be checked */
702 		break;
703 	}
704 
705 	delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET;
706 	return true;
707 }
708 
is_rename_source(git_diff * diff,const git_diff_find_options * opts,size_t delta_idx,void ** cache)709 static bool is_rename_source(
710 	git_diff *diff,
711 	const git_diff_find_options *opts,
712 	size_t delta_idx,
713 	void **cache)
714 {
715 	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
716 
717 	/* skip things that aren't blobs */
718 	if (!GIT_MODE_ISBLOB(delta->old_file.mode))
719 		return false;
720 
721 	switch (delta->status) {
722 	case GIT_DELTA_ADDED:
723 	case GIT_DELTA_UNTRACKED:
724 	case GIT_DELTA_UNREADABLE:
725 	case GIT_DELTA_IGNORED:
726 	case GIT_DELTA_CONFLICTED:
727 		return false;
728 
729 	case GIT_DELTA_DELETED:
730 	case GIT_DELTA_TYPECHANGE:
731 		break;
732 
733 	case GIT_DELTA_UNMODIFIED:
734 		if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
735 			return false;
736 		if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED))
737 			delta->flags |= GIT_DIFF_FLAG__TO_DELETE;
738 		break;
739 
740 	default: /* MODIFIED, RENAMED, COPIED */
741 		/* if we're finding copies, this could be a source */
742 		if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
743 			break;
744 
745 		/* otherwise, this is only a source if we can split it */
746 		if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) &&
747 			!FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES))
748 			return false;
749 
750 		if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
751 			return false;
752 
753 		if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
754 			delta->similarity < opts->break_rewrite_threshold) {
755 			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
756 			break;
757 		}
758 
759 		if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
760 			delta->similarity < opts->rename_from_rewrite_threshold)
761 			break;
762 
763 		return false;
764 	}
765 
766 	delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE;
767 	return true;
768 }
769 
delta_is_split(git_diff_delta * delta)770 GIT_INLINE(bool) delta_is_split(git_diff_delta *delta)
771 {
772 	return (delta->status == GIT_DELTA_TYPECHANGE ||
773 			(delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0);
774 }
775 
delta_is_new_only(git_diff_delta * delta)776 GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta)
777 {
778 	return (delta->status == GIT_DELTA_ADDED ||
779 			delta->status == GIT_DELTA_UNTRACKED ||
780 			delta->status == GIT_DELTA_UNREADABLE ||
781 			delta->status == GIT_DELTA_IGNORED);
782 }
783 
delta_make_rename(git_diff_delta * to,const git_diff_delta * from,uint16_t similarity)784 GIT_INLINE(void) delta_make_rename(
785 	git_diff_delta *to, const git_diff_delta *from, uint16_t similarity)
786 {
787 	to->status     = GIT_DELTA_RENAMED;
788 	to->similarity = similarity;
789 	to->nfiles     = 2;
790 	memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
791 	to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
792 }
793 
794 typedef struct {
795 	size_t   idx;
796 	uint16_t similarity;
797 } diff_find_match;
798 
git_diff_find_similar(git_diff * diff,const git_diff_find_options * given_opts)799 int git_diff_find_similar(
800 	git_diff *diff,
801 	const git_diff_find_options *given_opts)
802 {
803 	size_t s, t;
804 	int error = 0, result;
805 	uint16_t similarity;
806 	git_diff_delta *src, *tgt;
807 	git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
808 	size_t num_deltas, num_srcs = 0, num_tgts = 0;
809 	size_t tried_srcs = 0, tried_tgts = 0;
810 	size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
811 	size_t sigcache_size;
812 	void **sigcache = NULL; /* cache of similarity metric file signatures */
813 	diff_find_match *tgt2src = NULL;
814 	diff_find_match *src2tgt = NULL;
815 	diff_find_match *tgt2src_copy = NULL;
816 	diff_find_match *best_match;
817 	git_diff_file swap;
818 
819 	GIT_ASSERT_ARG(diff);
820 
821 	if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
822 		return error;
823 
824 	num_deltas = diff->deltas.length;
825 
826 	/* TODO: maybe abort if deltas.length > rename_limit ??? */
827 	if (!num_deltas || !git__is_uint32(num_deltas))
828 		goto cleanup;
829 
830 	/* No flags set; nothing to do */
831 	if ((opts.flags & GIT_DIFF_FIND_ALL) == 0)
832 		goto cleanup;
833 
834 	GIT_ERROR_CHECK_ALLOC_MULTIPLY(&sigcache_size, num_deltas, 2);
835 	sigcache = git__calloc(sigcache_size, sizeof(void *));
836 	GIT_ERROR_CHECK_ALLOC(sigcache);
837 
838 	/* Label rename sources and targets
839 	 *
840 	 * This will also set self-similarity scores for MODIFIED files and
841 	 * mark them for splitting if break-rewrites is enabled
842 	 */
843 	git_vector_foreach(&diff->deltas, t, tgt) {
844 		if (is_rename_source(diff, &opts, t, sigcache))
845 			++num_srcs;
846 
847 		if (is_rename_target(diff, &opts, t, sigcache))
848 			++num_tgts;
849 
850 		if ((tgt->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
851 			num_rewrites++;
852 	}
853 
854 	/* if there are no candidate srcs or tgts, we're done */
855 	if (!num_srcs || !num_tgts)
856 		goto cleanup;
857 
858 	src2tgt = git__calloc(num_deltas, sizeof(diff_find_match));
859 	GIT_ERROR_CHECK_ALLOC(src2tgt);
860 	tgt2src = git__calloc(num_deltas, sizeof(diff_find_match));
861 	GIT_ERROR_CHECK_ALLOC(tgt2src);
862 
863 	if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
864 		tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match));
865 		GIT_ERROR_CHECK_ALLOC(tgt2src_copy);
866 	}
867 
868 	/*
869 	 * Find best-fit matches for rename / copy candidates
870 	 */
871 
872 find_best_matches:
873 	tried_tgts = num_bumped = 0;
874 
875 	git_vector_foreach(&diff->deltas, t, tgt) {
876 		/* skip things that are not rename targets */
877 		if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
878 			continue;
879 
880 		tried_srcs = 0;
881 
882 		git_vector_foreach(&diff->deltas, s, src) {
883 			/* skip things that are not rename sources */
884 			if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
885 				continue;
886 
887 			/* calculate similarity for this pair and find best match */
888 			if (s == t)
889 				result = -1; /* don't measure self-similarity here */
890 			else if ((error = similarity_measure(
891 				&result, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0)
892 				goto cleanup;
893 
894 			if (result < 0)
895 				continue;
896 			similarity = (uint16_t)result;
897 
898 			/* is this a better rename? */
899 			if (tgt2src[t].similarity < similarity &&
900 				src2tgt[s].similarity < similarity)
901 			{
902 				/* eject old mapping */
903 				if (src2tgt[s].similarity > 0) {
904 					tgt2src[src2tgt[s].idx].similarity = 0;
905 					num_bumped++;
906 				}
907 				if (tgt2src[t].similarity > 0) {
908 					src2tgt[tgt2src[t].idx].similarity = 0;
909 					num_bumped++;
910 				}
911 
912 				/* write new mapping */
913 				tgt2src[t].idx = s;
914 				tgt2src[t].similarity = similarity;
915 				src2tgt[s].idx = t;
916 				src2tgt[s].similarity = similarity;
917 			}
918 
919 			/* keep best absolute match for copies */
920 			if (tgt2src_copy != NULL &&
921 				tgt2src_copy[t].similarity < similarity)
922 			{
923 				tgt2src_copy[t].idx = s;
924 				tgt2src_copy[t].similarity = similarity;
925 			}
926 
927 			if (++tried_srcs >= num_srcs)
928 				break;
929 
930 			/* cap on maximum targets we'll examine (per "tgt" file) */
931 			if (tried_srcs > opts.rename_limit)
932 				break;
933 		}
934 
935 		if (++tried_tgts >= num_tgts)
936 			break;
937 	}
938 
939 	if (num_bumped > 0) /* try again if we bumped some items */
940 		goto find_best_matches;
941 
942 	/*
943 	 * Rewrite the diffs with renames / copies
944 	 */
945 
946 	git_vector_foreach(&diff->deltas, t, tgt) {
947 		/* skip things that are not rename targets */
948 		if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
949 			continue;
950 
951 		/* check if this delta was the target of a similarity */
952 		if (tgt2src[t].similarity)
953 			best_match = &tgt2src[t];
954 		else if (tgt2src_copy && tgt2src_copy[t].similarity)
955 			best_match = &tgt2src_copy[t];
956 		else
957 			continue;
958 
959 		s = best_match->idx;
960 		src = GIT_VECTOR_GET(&diff->deltas, s);
961 
962 		/* possible scenarios:
963 		 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
964 		 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
965 		 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
966 		 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
967 		 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
968 		 */
969 
970 		if (src->status == GIT_DELTA_DELETED) {
971 
972 			if (delta_is_new_only(tgt)) {
973 
974 				if (best_match->similarity < opts.rename_threshold)
975 					continue;
976 
977 				delta_make_rename(tgt, src, best_match->similarity);
978 
979 				src->flags |= GIT_DIFF_FLAG__TO_DELETE;
980 				num_rewrites++;
981 			} else {
982 				GIT_ASSERT(delta_is_split(tgt));
983 
984 				if (best_match->similarity < opts.rename_from_rewrite_threshold)
985 					continue;
986 
987 				memcpy(&swap, &tgt->old_file, sizeof(swap));
988 
989 				delta_make_rename(tgt, src, best_match->similarity);
990 				num_rewrites--;
991 
992 				GIT_ASSERT(src->status == GIT_DELTA_DELETED);
993 				memcpy(&src->old_file, &swap, sizeof(src->old_file));
994 				memset(&src->new_file, 0, sizeof(src->new_file));
995 				src->new_file.path = src->old_file.path;
996 				src->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;
997 
998 				num_updates++;
999 
1000 				if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
1001 					/* what used to be at src t is now at src s */
1002 					tgt2src[src2tgt[t].idx].idx = s;
1003 				}
1004 			}
1005 		}
1006 
1007 		else if (delta_is_split(src)) {
1008 
1009 			if (delta_is_new_only(tgt)) {
1010 
1011 				if (best_match->similarity < opts.rename_threshold)
1012 					continue;
1013 
1014 				delta_make_rename(tgt, src, best_match->similarity);
1015 
1016 				src->status = (diff->new_src == GIT_ITERATOR_WORKDIR) ?
1017 					GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
1018 				src->nfiles = 1;
1019 				memset(&src->old_file, 0, sizeof(src->old_file));
1020 				src->old_file.path = src->new_file.path;
1021 				src->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
1022 
1023 				src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1024 				num_rewrites--;
1025 
1026 				num_updates++;
1027 			} else {
1028 				GIT_ASSERT(delta_is_split(src));
1029 
1030 				if (best_match->similarity < opts.rename_from_rewrite_threshold)
1031 					continue;
1032 
1033 				memcpy(&swap, &tgt->old_file, sizeof(swap));
1034 
1035 				delta_make_rename(tgt, src, best_match->similarity);
1036 				num_rewrites--;
1037 				num_updates++;
1038 
1039 				memcpy(&src->old_file, &swap, sizeof(src->old_file));
1040 
1041 				/* if we've just swapped the new element into the correct
1042 				 * place, clear the SPLIT and RENAME_TARGET flags
1043 				 */
1044 				if (tgt2src[s].idx == t &&
1045 					tgt2src[s].similarity >
1046 					opts.rename_from_rewrite_threshold) {
1047 					src->status     = GIT_DELTA_RENAMED;
1048 					src->similarity = tgt2src[s].similarity;
1049 					tgt2src[s].similarity = 0;
1050 					src->flags &= ~(GIT_DIFF_FLAG__TO_SPLIT | GIT_DIFF_FLAG__IS_RENAME_TARGET);
1051 					num_rewrites--;
1052 				}
1053 				/* otherwise, if we just overwrote a source, update mapping */
1054 				else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
1055 					/* what used to be at src t is now at src s */
1056 					tgt2src[src2tgt[t].idx].idx = s;
1057 				}
1058 
1059 				num_updates++;
1060 			}
1061 		}
1062 
1063 		else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
1064 			if (tgt2src_copy[t].similarity < opts.copy_threshold)
1065 				continue;
1066 
1067 			/* always use best possible source for copy */
1068 			best_match = &tgt2src_copy[t];
1069 			src = GIT_VECTOR_GET(&diff->deltas, best_match->idx);
1070 
1071 			if (delta_is_split(tgt)) {
1072 				error = insert_delete_side_of_split(diff, &diff->deltas, tgt);
1073 				if (error < 0)
1074 					goto cleanup;
1075 				num_rewrites--;
1076 			}
1077 
1078 			if (!delta_is_split(tgt) && !delta_is_new_only(tgt))
1079 				continue;
1080 
1081 			tgt->status     = GIT_DELTA_COPIED;
1082 			tgt->similarity = best_match->similarity;
1083 			tgt->nfiles     = 2;
1084 			memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file));
1085 			tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
1086 
1087 			num_updates++;
1088 		}
1089 	}
1090 
1091 	/*
1092 	 * Actually split and delete entries as needed
1093 	 */
1094 
1095 	if (num_rewrites > 0 || num_updates > 0)
1096 		error = apply_splits_and_deletes(
1097 			diff, diff->deltas.length - num_rewrites,
1098 			FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES) &&
1099 			!FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY));
1100 
1101 cleanup:
1102 	git__free(tgt2src);
1103 	git__free(src2tgt);
1104 	git__free(tgt2src_copy);
1105 
1106 	if (sigcache) {
1107 		for (t = 0; t < num_deltas * 2; ++t) {
1108 			if (sigcache[t] != NULL)
1109 				opts.metric->free_signature(sigcache[t], opts.metric->payload);
1110 		}
1111 		git__free(sigcache);
1112 	}
1113 
1114 	if (!given_opts || !given_opts->metric)
1115 		git__free(opts.metric);
1116 
1117 	return error;
1118 }
1119 
1120 #undef FLAG_SET
1121