1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7
8 #include "diff_tform.h"
9
10 #include "git2/config.h"
11 #include "git2/blob.h"
12 #include "git2/sys/hashsig.h"
13
14 #include "diff.h"
15 #include "diff_generate.h"
16 #include "path.h"
17 #include "futils.h"
18 #include "config.h"
19
/*
 * Allocate (with git__malloc) a copy of delta `d`.
 *
 * The struct is copied bytewise and its internal-only flag bits are
 * cleared.  The path strings are duplicated into `pool`.  When the
 * source's new path is NULL, or is the very same pointer as its old path,
 * the copy's new path just aliases the copy's old path.
 *
 * Returns the new delta, or NULL if any allocation fails.
 */
git_diff_delta *git_diff__delta_dup(
	const git_diff_delta *d, git_pool *pool)
{
	git_diff_delta *copy;
	bool alias_old_path;

	if ((copy = git__malloc(sizeof(git_diff_delta))) == NULL)
		return NULL;

	memcpy(copy, d, sizeof(git_diff_delta));
	GIT_DIFF_FLAG__CLEAR_INTERNAL(copy->flags);

	if (d->old_file.path != NULL) {
		copy->old_file.path = git_pool_strdup(pool, d->old_file.path);

		if (copy->old_file.path == NULL) {
			git__free(copy);
			return NULL;
		}
	}

	alias_old_path =
		(d->new_file.path == d->old_file.path || d->new_file.path == NULL);

	if (alias_old_path) {
		copy->new_file.path = copy->old_file.path;
		return copy;
	}

	copy->new_file.path = git_pool_strdup(pool, d->new_file.path);

	if (copy->new_file.path == NULL) {
		git__free(copy);
		return NULL;
	}

	return copy;
}
50
/*
 * Merge callback that combines two deltas for the same path the way
 * C git does for 'git diff <sha>'.
 *
 * When C git diffs the work dir against a tree, it actually diffs with
 * the index but uses the workdir contents.  The three file descriptions
 * involved are:
 *    f1 = a->old_file
 *    f2 = a->new_file AND b->old_file
 *    f3 = b->new_file
 *
 * Returns a newly duplicated delta (see git_diff__delta_dup), or NULL if
 * the duplication fails.
 */
git_diff_delta *git_diff__merge_like_cgit(
	const git_diff_delta *a,
	const git_diff_delta *b,
	git_pool *pool)
{
	git_diff_delta *merged;

	/* a conflict on either side is passed through untouched; 'b' wins */
	if (b->status == GIT_DELTA_CONFLICTED)
		return git_diff__delta_dup(b, pool);

	if (a->status == GIT_DELTA_CONFLICTED)
		return git_diff__delta_dup(a, pool);

	/* f2 is deleted, or f2 == f3: the 'a' delta already says it all */
	if (a->status == GIT_DELTA_DELETED || b->status == GIT_DELTA_UNMODIFIED)
		return git_diff__delta_dup(a, pool);

	/* everything else starts out as a copy of the 'b' delta */
	merged = git_diff__delta_dup(b, pool);
	if (merged == NULL)
		return NULL;

	/* an uninteresting 'a' status contributes nothing further */
	switch (a->status) {
	case GIT_DELTA_UNMODIFIED:
	case GIT_DELTA_UNTRACKED:
	case GIT_DELTA_UNREADABLE:
		return merged;
	default:
		break;
	}

	assert(b->status != GIT_DELTA_UNMODIFIED);

	if (merged->status != GIT_DELTA_DELETED) {
		merged->status = a->status;
		merged->nfiles = a->nfiles;
	} else if (a->status == GIT_DELTA_ADDED) {
		/* cgit exception: a file that is only in the index (i.e. not
		 * in HEAD nor workdir) is reported as an empty diff
		 */
		merged->status = GIT_DELTA_UNMODIFIED;
		merged->nfiles = 2;
	}
	/* otherwise the DELETED status from 'b' is left alone */

	/* the old side of the result always describes f1 */
	dup_old_side:
	merged->old_file.flags = a->old_file.flags;
	merged->old_file.size = a->old_file.size;
	merged->old_file.mode = a->old_file.mode;
	git_oid_cpy(&merged->old_file.id, &a->old_file.id);

	return merged;
}
113
/*
 * Merge the deltas of `from` into `onto`.
 *
 * Both delta lists are walked in parallel, ordered by old_file.path
 * (compared case-insensitively when GIT_DIFF_IGNORE_CASE is set).  A delta
 * present on only one side is duplicated; when both sides have a delta
 * for the same path, `cb` produces the combined delta.  Results are built
 * in a fresh vector and pool which are swapped into `onto` only if every
 * step succeeded, so `onto` is left untouched on failure.
 *
 * Returns 0 on success (including when `from` has no deltas) and a
 * negative value on error.  Merging diffs whose IGNORE_CASE or REVERSE
 * flags differ is rejected with GIT_ERROR_INVALID.
 */
int git_diff__merge(
	git_diff *onto, const git_diff *from, git_diff__merge_cb cb)
{
	int error = 0;
	git_pool onto_pool;
	git_vector onto_new;
	git_diff_delta *delta;
	bool ignore_case, reversed;
	size_t i, j;

	assert(onto && from);

	if (!from->deltas.length)
		return 0;

	ignore_case = ((onto->opts.flags & GIT_DIFF_IGNORE_CASE) != 0);
	reversed    = ((onto->opts.flags & GIT_DIFF_REVERSE) != 0);

	if (ignore_case != ((from->opts.flags & GIT_DIFF_IGNORE_CASE) != 0) ||
		reversed    != ((from->opts.flags & GIT_DIFF_REVERSE) != 0)) {
		git_error_set(GIT_ERROR_INVALID,
			"attempt to merge diffs created with conflicting options");
		return -1;
	}

	if (git_vector_init(&onto_new, onto->deltas.length, git_diff_delta__cmp) < 0)
		return -1;

	if (git_pool_init(&onto_pool, 1) < 0) {
		/* don't leak the vector that was just set up */
		git_vector_free(&onto_new);
		return -1;
	}

	for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) {
		git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i);
		const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j);
		int cmp = !f ? -1 : !o ? 1 :
			STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path);

		if (cmp < 0) {
			delta = git_diff__delta_dup(o, &onto_pool);
			i++;
		} else if (cmp > 0) {
			delta = git_diff__delta_dup(f, &onto_pool);
			j++;
		} else {
			/* for a reversed diff the roles of the two sides swap */
			const git_diff_delta *left = reversed ? f : o;
			const git_diff_delta *right = reversed ? o : f;

			delta = cb(left, right, &onto_pool);
			i++;
			j++;
		}

		if (!delta) {
			error = -1;
			break;
		}

		/* the ignore rules for the target may not match the source
		 * or the result of a merged delta could be skippable...
		 */
		if (git_diff_delta__should_skip(&onto->opts, delta)) {
			git__free(delta);
			continue;
		}

		if ((error = git_vector_insert(&onto_new, delta)) < 0) {
			/* the vector never took ownership, so release it here */
			git__free(delta);
			break;
		}
	}

	if (!error) {
		git_vector_swap(&onto->deltas, &onto_new);
		git_pool_swap(&onto->pool, &onto_pool);

		if ((onto->opts.flags & GIT_DIFF_REVERSE) != 0)
			onto->old_src = from->old_src;
		else
			onto->new_src = from->new_src;

		/* prefix strings also come from old pool, so recreate those.*/
		onto->opts.old_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix);
		onto->opts.new_prefix =
			git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix);
	}

	/* after a successful swap these hold the previous contents of `onto` */
	git_vector_free_deep(&onto_new);
	git_pool_clear(&onto_pool);

	return error;
}
197
/*
 * Merge the deltas of `from` into `onto`, combining deltas for the same
 * path with git_diff__merge_like_cgit.  Returns whatever
 * git_diff__merge returns.
 */
int git_diff_merge(git_diff *onto, const git_diff *from)
{
	git_diff__merge_cb merge_cb = git_diff__merge_like_cgit;

	return git_diff__merge(onto, from, merge_cb);
}
202
git_diff_find_similar__hashsig_for_file(void ** out,const git_diff_file * f,const char * path,void * p)203 int git_diff_find_similar__hashsig_for_file(
204 void **out, const git_diff_file *f, const char *path, void *p)
205 {
206 git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
207
208 GIT_UNUSED(f);
209 return git_hashsig_create_fromfile((git_hashsig **)out, path, opt);
210 }
211
git_diff_find_similar__hashsig_for_buf(void ** out,const git_diff_file * f,const char * buf,size_t len,void * p)212 int git_diff_find_similar__hashsig_for_buf(
213 void **out, const git_diff_file *f, const char *buf, size_t len, void *p)
214 {
215 git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p;
216
217 GIT_UNUSED(f);
218 return git_hashsig_create((git_hashsig **)out, buf, len, opt);
219 }
220
git_diff_find_similar__hashsig_free(void * sig,void * payload)221 void git_diff_find_similar__hashsig_free(void *sig, void *payload)
222 {
223 GIT_UNUSED(payload);
224 git_hashsig_free(sig);
225 }
226
/*
 * Built-in similarity metric hook: compare two hash signatures.
 *
 * On success the non-negative result of git_hashsig_compare is written
 * to `*score` and 0 is returned.  A negative comparison result is
 * returned as the error code and `*score` is left untouched.
 */
int git_diff_find_similar__calc_similarity(
	int *score, void *siga, void *sigb, void *payload)
{
	int compared;

	GIT_UNUSED(payload);

	compared = git_hashsig_compare(siga, sigb);

	if (compared >= 0) {
		*score = compared;
		return 0;
	}

	return compared;
}
240
241 #define DEFAULT_THRESHOLD 50
242 #define DEFAULT_BREAK_REWRITE_THRESHOLD 60
243 #define DEFAULT_RENAME_LIMIT 200
244
normalize_find_opts(git_diff * diff,git_diff_find_options * opts,const git_diff_find_options * given)245 static int normalize_find_opts(
246 git_diff *diff,
247 git_diff_find_options *opts,
248 const git_diff_find_options *given)
249 {
250 git_config *cfg = NULL;
251 git_hashsig_option_t hashsig_opts;
252
253 GIT_ERROR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options");
254
255 if (diff->repo != NULL &&
256 git_repository_config__weakptr(&cfg, diff->repo) < 0)
257 return -1;
258
259 if (given)
260 memcpy(opts, given, sizeof(*opts));
261
262 if (!given ||
263 (given->flags & GIT_DIFF_FIND_ALL) == GIT_DIFF_FIND_BY_CONFIG)
264 {
265 if (cfg) {
266 char *rule =
267 git_config__get_string_force(cfg, "diff.renames", "true");
268 int boolval;
269
270 if (!git__parse_bool(&boolval, rule) && !boolval)
271 /* don't set FIND_RENAMES if bool value is false */;
272 else if (!strcasecmp(rule, "copies") || !strcasecmp(rule, "copy"))
273 opts->flags |= GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES;
274 else
275 opts->flags |= GIT_DIFF_FIND_RENAMES;
276
277 git__free(rule);
278 } else {
279 /* set default flag */
280 opts->flags |= GIT_DIFF_FIND_RENAMES;
281 }
282 }
283
284 /* some flags imply others */
285
286 if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) {
287 /* if we are only looking for exact matches, then don't turn
288 * MODIFIED items into ADD/DELETE pairs because it's too picky
289 */
290 opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES);
291
292 /* similarly, don't look for self-rewrites to split */
293 opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES;
294 }
295
296 if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES)
297 opts->flags |= GIT_DIFF_FIND_RENAMES;
298
299 if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)
300 opts->flags |= GIT_DIFF_FIND_COPIES;
301
302 if (opts->flags & GIT_DIFF_BREAK_REWRITES)
303 opts->flags |= GIT_DIFF_FIND_REWRITES;
304
305 #define USE_DEFAULT(X) ((X) == 0 || (X) > 100)
306
307 if (USE_DEFAULT(opts->rename_threshold))
308 opts->rename_threshold = DEFAULT_THRESHOLD;
309
310 if (USE_DEFAULT(opts->rename_from_rewrite_threshold))
311 opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD;
312
313 if (USE_DEFAULT(opts->copy_threshold))
314 opts->copy_threshold = DEFAULT_THRESHOLD;
315
316 if (USE_DEFAULT(opts->break_rewrite_threshold))
317 opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD;
318
319 #undef USE_DEFAULT
320
321 if (!opts->rename_limit) {
322 if (cfg) {
323 opts->rename_limit = git_config__get_int_force(
324 cfg, "diff.renamelimit", DEFAULT_RENAME_LIMIT);
325 }
326
327 if (opts->rename_limit <= 0)
328 opts->rename_limit = DEFAULT_RENAME_LIMIT;
329 }
330
331 /* assign the internal metric with whitespace flag as payload */
332 if (!opts->metric) {
333 opts->metric = git__malloc(sizeof(git_diff_similarity_metric));
334 GIT_ERROR_CHECK_ALLOC(opts->metric);
335
336 opts->metric->file_signature = git_diff_find_similar__hashsig_for_file;
337 opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf;
338 opts->metric->free_signature = git_diff_find_similar__hashsig_free;
339 opts->metric->similarity = git_diff_find_similar__calc_similarity;
340
341 if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE)
342 hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE;
343 else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE)
344 hashsig_opts = GIT_HASHSIG_NORMAL;
345 else
346 hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE;
347 hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES;
348 opts->metric->payload = (void *)hashsig_opts;
349 }
350
351 return 0;
352 }
353
/*
 * Create the DELETED half of a delta that is being split and insert it
 * into `onto`.
 *
 * The new record is a duplicate of `delta` (paths copied into
 * `diff->pool`) whose status is DELETED with a single file: the new side
 * is zeroed, shares the old side's path and is flagged as having a valid
 * id.
 *
 * Returns the result of git_vector_insert, or -1 if the duplicate could
 * not be allocated.  If the insertion fails the new record is freed, as
 * nothing else would own it.
 */
static int insert_delete_side_of_split(
	git_diff *diff, git_vector *onto, const git_diff_delta *delta)
{
	int error;

	/* make new record for DELETED side of split */
	git_diff_delta *deleted = git_diff__delta_dup(delta, &diff->pool);
	GIT_ERROR_CHECK_ALLOC(deleted);

	deleted->status = GIT_DELTA_DELETED;
	deleted->nfiles = 1;
	memset(&deleted->new_file, 0, sizeof(deleted->new_file));
	deleted->new_file.path = deleted->old_file.path;
	deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;

	/* the path strings belong to the pool; only the struct is ours */
	if ((error = git_vector_insert(onto, deleted)) < 0)
		git__free(deleted);

	return error;
}
369
/*
 * Rebuild `diff->deltas`, dropping every delta flagged TO_DELETE and,
 * when `actually_split` is set, breaking every delta flagged TO_SPLIT
 * into a DELETED record plus an ADDED (or UNTRACKED, for a workdir new
 * side) record.
 *
 * `expected_size` is the initial capacity for the replacement list.
 * Internal flags are cleared on every surviving delta, and the similarity
 * score is reset except on COPIED / RENAMED deltas (and on MODIFIED ones
 * when not actually splitting).
 *
 * Returns 0 on success, -1 on failure.  On failure `diff->deltas` keeps
 * all of its entries, although deltas processed before the failure have
 * already been modified in place.
 */
static int apply_splits_and_deletes(
	git_diff *diff, size_t expected_size, bool actually_split)
{
	git_vector onto = GIT_VECTOR_INIT;
	size_t i;
	git_diff_delta *delta;

	if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0)
		return -1;

	/* build new delta list without TO_DELETE and splitting TO_SPLIT */
	git_vector_foreach(&diff->deltas, i, delta) {
		if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
			continue;

		if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) {
			delta->similarity = 0;

			if (insert_delete_side_of_split(diff, &onto, delta) < 0)
				goto on_error;

			/* what is left of the original becomes the "new file" half */
			if (diff->new_src == GIT_ITERATOR_WORKDIR)
				delta->status = GIT_DELTA_UNTRACKED;
			else
				delta->status = GIT_DELTA_ADDED;
			delta->nfiles = 1;
			memset(&delta->old_file, 0, sizeof(delta->old_file));
			delta->old_file.path = delta->new_file.path;
			delta->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;
		}

		/* clean up delta before inserting into new list */
		GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags);

		if (delta->status != GIT_DELTA_COPIED &&
			delta->status != GIT_DELTA_RENAMED &&
			(delta->status != GIT_DELTA_MODIFIED || actually_split))
			delta->similarity = 0;

		/* insert into new list */
		if (git_vector_insert(&onto, delta) < 0)
			goto on_error;
	}

	/* cannot return an error past this point */

	/* free deltas from old list that didn't make it to the new one */
	git_vector_foreach(&diff->deltas, i, delta) {
		if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0)
			git__free(delta);
	}

	/* swap new delta list into place */
	git_vector_swap(&diff->deltas, &onto);
	git_vector_free(&onto);
	git_vector_sort(&diff->deltas);

	return 0;

on_error:
	/* `onto` holds a mix of deltas that diff->deltas still references
	 * and DELETED halves created above.  Only the latter may be freed
	 * here: freeing the former would leave diff->deltas full of dangling
	 * pointers.
	 */
	git_vector_foreach(&onto, i, delta) {
		size_t k;
		git_diff_delta *existing;
		bool still_in_diff = false;

		git_vector_foreach(&diff->deltas, k, existing) {
			if (existing == delta) {
				still_in_diff = true;
				break;
			}
		}

		if (!still_in_diff)
			git__free(delta);
	}

	git_vector_free(&onto);

	return -1;
}
434
similarity_get_file(git_diff * diff,size_t idx)435 GIT_INLINE(git_diff_file *) similarity_get_file(git_diff *diff, size_t idx)
436 {
437 git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2);
438 return (idx & 1) ? &delta->new_file : &delta->old_file;
439 }
440
/*
 * Scratch state used while computing the similarity signature of one
 * side of one delta.  Filled in by similarity_init() and released by
 * similarity_unload().
 */
typedef struct {
	size_t idx;               /* file index: 2 * delta index (+1 for new side) */
	git_iterator_t src;       /* iterator type the file came from */
	git_repository *repo;     /* repository of the diff */
	git_diff_file *file;      /* the file entry inside the delta */
	git_buf data;             /* full workdir path, built for workdir files */
	git_odb_object *odb_obj;  /* odb object, if one was loaded to resolve the size */
	git_blob *blob;           /* blob loaded for non-workdir files */
} similarity_info;
450
/*
 * Prepare `info` for the file with index `file_idx` (odd indexes are the
 * new side of a delta, even ones the old side).
 *
 * For a file that does not come from the workdir and has no known size,
 * git_diff_file__resolve_zero_size is called, which may hand back an odb
 * object in `info->odb_obj`; its result is returned.  Otherwise returns 0.
 */
static int similarity_init(
	similarity_info *info, git_diff *diff, size_t file_idx)
{
	bool is_new_side = ((file_idx & 1) != 0);

	info->idx = file_idx;
	info->repo = diff->repo;
	info->src = is_new_side ? diff->new_src : diff->old_src;
	info->file = similarity_get_file(diff, file_idx);
	info->blob = NULL;
	info->odb_obj = NULL;
	git_buf_init(&info->data, 0);

	if (info->src != GIT_ITERATOR_WORKDIR && !(info->file->size > 0))
		return git_diff_file__resolve_zero_size(
			info->file, &info->odb_obj, info->repo);

	return 0;
}
468
/*
 * Compute the similarity signature for the file described by `info` and
 * store it in `cache[info->idx]`.
 *
 * Workdir files are handed to the metric's file_signature hook by path;
 * anything that is not a regular file is skipped.  Other files are loaded
 * as blobs (reusing `info->odb_obj` when similarity_init already fetched
 * it) and handed to the buffer_signature hook.
 *
 * A file that is skipped leaves its cache slot untouched and is not an
 * error.  That includes a blob that cannot be looked up: the lookup error
 * is cleared and 0 is returned, so the caller simply has no signature to
 * compare instead of receiving a failure code with no error message.
 *
 * Returns 0 on success or skip, a negative value on error.
 */
static int similarity_sig(
	similarity_info *info,
	const git_diff_find_options *opts,
	void **cache)
{
	int error = 0;
	git_diff_file *file = info->file;

	if (info->src == GIT_ITERATOR_WORKDIR) {
		if ((error = git_buf_joinpath(
			&info->data, git_repository_workdir(info->repo), file->path)) < 0)
			return error;

		/* if path is not a regular file, just skip this item */
		if (!git_path_isfile(info->data.ptr))
			return 0;

		/* TODO: apply wd-to-odb filters to file data if necessary */

		error = opts->metric->file_signature(
			&cache[info->idx], info->file,
			info->data.ptr, opts->metric->payload);
	} else {
		/* if we didn't initially know the size, we might have an odb_obj
		 * around from earlier, so convert that, otherwise load the blob now
		 */
		if (info->odb_obj != NULL)
			error = git_object__from_odb_object(
				(git_object **)&info->blob, info->repo,
				info->odb_obj, GIT_OBJECT_BLOB);
		else
			error = git_blob_lookup(&info->blob, info->repo, &file->id);

		if (error < 0) {
			/* if lookup fails, just skip this item in similarity calc */
			git_error_clear();
			error = 0;
		} else {
			size_t sz;

			/* index size may not be actual blob size if filtered */
			if (file->size != git_blob_rawsize(info->blob))
				file->size = git_blob_rawsize(info->blob);

			/* a size that does not fit in size_t is passed as (size_t)-1 */
			sz = git__is_sizet(file->size) ? (size_t)file->size : (size_t)-1;

			error = opts->metric->buffer_signature(
				&cache[info->idx], info->file,
				git_blob_rawcontent(info->blob), sz, opts->metric->payload);
		}
	}

	return error;
}
522
/*
 * Release whatever similarity_init / similarity_sig attached to `info`:
 * the odb object if there is one, and then either the blob or, when no
 * blob was loaded, the path buffer.
 */
static void similarity_unload(similarity_info *info)
{
	if (info->odb_obj != NULL)
		git_odb_object_free(info->odb_obj);

	if (info->blob == NULL) {
		git_buf_dispose(&info->data);
		return;
	}

	git_blob_free(info->blob);
}
533
534 #define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0)
535
536 /* - score < 0 means files cannot be compared
537 * - score >= 100 means files are exact match
538 * - score == 0 means files are completely different
539 */
similarity_measure(int * score,git_diff * diff,const git_diff_find_options * opts,void ** cache,size_t a_idx,size_t b_idx)540 static int similarity_measure(
541 int *score,
542 git_diff *diff,
543 const git_diff_find_options *opts,
544 void **cache,
545 size_t a_idx,
546 size_t b_idx)
547 {
548 git_diff_file *a_file = similarity_get_file(diff, a_idx);
549 git_diff_file *b_file = similarity_get_file(diff, b_idx);
550 bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY);
551 int error = 0;
552 similarity_info a_info, b_info;
553
554 *score = -1;
555
556 /* don't try to compare things that aren't files */
557 if (!GIT_MODE_ISBLOB(a_file->mode) || !GIT_MODE_ISBLOB(b_file->mode))
558 return 0;
559
560 /* if exact match is requested, force calculation of missing OIDs now */
561 if (exact_match) {
562 if (git_oid_is_zero(&a_file->id) &&
563 diff->old_src == GIT_ITERATOR_WORKDIR &&
564 !git_diff__oid_for_file(&a_file->id,
565 diff, a_file->path, a_file->mode, a_file->size))
566 a_file->flags |= GIT_DIFF_FLAG_VALID_ID;
567
568 if (git_oid_is_zero(&b_file->id) &&
569 diff->new_src == GIT_ITERATOR_WORKDIR &&
570 !git_diff__oid_for_file(&b_file->id,
571 diff, b_file->path, b_file->mode, b_file->size))
572 b_file->flags |= GIT_DIFF_FLAG_VALID_ID;
573 }
574
575 /* check OID match as a quick test */
576 if (git_oid__cmp(&a_file->id, &b_file->id) == 0) {
577 *score = 100;
578 return 0;
579 }
580
581 /* don't calculate signatures if we are doing exact match */
582 if (exact_match) {
583 *score = 0;
584 return 0;
585 }
586
587 memset(&a_info, 0, sizeof(a_info));
588 memset(&b_info, 0, sizeof(b_info));
589
590 /* set up similarity data (will try to update missing file sizes) */
591 if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0)
592 return error;
593 if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0)
594 goto cleanup;
595
596 /* check if file sizes are nowhere near each other */
597 if (a_file->size > 127 &&
598 b_file->size > 127 &&
599 (a_file->size > (b_file->size << 3) ||
600 b_file->size > (a_file->size << 3)))
601 goto cleanup;
602
603 /* update signature cache if needed */
604 if (!cache[a_idx]) {
605 if ((error = similarity_sig(&a_info, opts, cache)) < 0)
606 goto cleanup;
607 }
608 if (!cache[b_idx]) {
609 if ((error = similarity_sig(&b_info, opts, cache)) < 0)
610 goto cleanup;
611 }
612
613 /* calculate similarity provided that the metric choose to process
614 * both the a and b files (some may not if file is too big, etc).
615 */
616 if (cache[a_idx] && cache[b_idx])
617 error = opts->metric->similarity(
618 score, cache[a_idx], cache[b_idx], opts->metric->payload);
619
620 cleanup:
621 similarity_unload(&a_info);
622 similarity_unload(&b_info);
623
624 return error;
625 }
626
/*
 * Compute how similar the old and new side of delta `delta_idx` are to
 * each other and record it in the delta.
 *
 * The score is only computed once per delta: the HAS_SELF_SIMILARITY
 * flag marks deltas whose `similarity` field is already filled in.  If
 * the two sides cannot be compared the delta is left unchanged.
 *
 * Returns 0 on success, or the negative error from similarity_measure.
 */
static int calc_self_similarity(
	git_diff *diff,
	const git_diff_find_options *opts,
	size_t delta_idx,
	void **cache)
{
	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
	size_t old_side = 2 * delta_idx;
	size_t new_side = 2 * delta_idx + 1;
	int score = -1;
	int error;

	if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0)
		return 0;

	error = similarity_measure(
		&score, diff, opts, cache, old_side, new_side);

	if (error < 0)
		return error;

	if (score < 0)
		return 0;

	delta->similarity = (uint16_t)score;
	delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY;

	return 0;
}
651
/*
 * Decide whether delta `delta_idx` may be the target of a rename / copy
 * and flag it with IS_RENAME_TARGET if so.
 *
 * Only blobs on the new side qualify.  ADDED, RENAMED, COPIED and other
 * unlisted statuses are targets; UNTRACKED only when FIND_FOR_UNTRACKED
 * is set; MODIFIED only when rewrite detection is enabled and the file's
 * self-similarity is below the relevant threshold, in which case it is
 * also flagged TO_SPLIT.
 */
static bool is_rename_target(
	git_diff *diff,
	const git_diff_find_options *opts,
	size_t delta_idx,
	void **cache)
{
	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);
	bool below_break, below_rewrite;

	/* skip things that aren't plain blobs */
	if (!GIT_MODE_ISBLOB(delta->new_file.mode))
		return false;

	switch (delta->status) {
	case GIT_DELTA_UNMODIFIED:
	case GIT_DELTA_DELETED:
	case GIT_DELTA_IGNORED:
	case GIT_DELTA_CONFLICTED:
		/* these never receive a rename */
		return false;

	case GIT_DELTA_UNTRACKED:
		if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED))
			return false;
		break;

	case GIT_DELTA_MODIFIED:
		/* a modified file is only a target if it can be split */
		if (!(FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) ||
			  FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES)))
			return false;

		if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
			return false;

		below_break =
			FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
			delta->similarity < opts->break_rewrite_threshold;
		below_rewrite =
			FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
			delta->similarity < opts->rename_from_rewrite_threshold;

		if (!below_break && !below_rewrite)
			return false;

		delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
		break;

	default: /* all other status values should be checked */
		break;
	}

	delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET;
	return true;
}
707
/*
 * Decide whether delta `delta_idx` may be the source of a rename / copy
 * and flag it with IS_RENAME_SOURCE if so.
 *
 * Only blobs on the old side qualify.  DELETED and TYPECHANGE deltas are
 * always sources.  UNMODIFIED ones are sources only with
 * FIND_COPIES_FROM_UNMODIFIED (and get flagged TO_DELETE when
 * FIND_REMOVE_UNMODIFIED is set).  MODIFIED / RENAMED / COPIED deltas are
 * sources when looking for copies, or when rewrite detection is enabled
 * and the file's self-similarity is below the relevant threshold.
 */
static bool is_rename_source(
	git_diff *diff,
	const git_diff_find_options *opts,
	size_t delta_idx,
	void **cache)
{
	git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx);

	/* skip things that aren't blobs */
	if (!GIT_MODE_ISBLOB(delta->old_file.mode))
		return false;

	switch (delta->status) {
	case GIT_DELTA_ADDED:
	case GIT_DELTA_UNTRACKED:
	case GIT_DELTA_UNREADABLE:
	case GIT_DELTA_IGNORED:
	case GIT_DELTA_CONFLICTED:
		/* these never provide content for a rename */
		return false;

	case GIT_DELTA_DELETED:
	case GIT_DELTA_TYPECHANGE:
		break;

	case GIT_DELTA_UNMODIFIED:
		if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED))
			return false;

		if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED))
			delta->flags |= GIT_DIFF_FLAG__TO_DELETE;
		break;

	default: /* MODIFIED, RENAMED, COPIED */
		/* when finding copies, any of these can be a source */
		if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES))
			break;

		/* without copies it has to be something we can split */
		if (!(FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) ||
			  FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES)))
			return false;

		if (calc_self_similarity(diff, opts, delta_idx, cache) < 0)
			return false;

		if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) &&
			delta->similarity < opts->break_rewrite_threshold) {
			delta->flags |= GIT_DIFF_FLAG__TO_SPLIT;
			break;
		}

		/* below the rename-from-rewrite threshold this is a source as
		 * well, but it is not marked for splitting here
		 */
		if (!(FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) &&
			  delta->similarity < opts->rename_from_rewrite_threshold))
			return false;

		break;
	}

	delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE;
	return true;
}
768
delta_is_split(git_diff_delta * delta)769 GIT_INLINE(bool) delta_is_split(git_diff_delta *delta)
770 {
771 return (delta->status == GIT_DELTA_TYPECHANGE ||
772 (delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0);
773 }
774
delta_is_new_only(git_diff_delta * delta)775 GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta)
776 {
777 return (delta->status == GIT_DELTA_ADDED ||
778 delta->status == GIT_DELTA_UNTRACKED ||
779 delta->status == GIT_DELTA_UNREADABLE ||
780 delta->status == GIT_DELTA_IGNORED);
781 }
782
delta_make_rename(git_diff_delta * to,const git_diff_delta * from,uint16_t similarity)783 GIT_INLINE(void) delta_make_rename(
784 git_diff_delta *to, const git_diff_delta *from, uint16_t similarity)
785 {
786 to->status = GIT_DELTA_RENAMED;
787 to->similarity = similarity;
788 to->nfiles = 2;
789 memcpy(&to->old_file, &from->old_file, sizeof(to->old_file));
790 to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
791 }
792
/*
 * One entry of the source <-> target match tables built by
 * git_diff_find_similar(); the tables are indexed by delta index.
 */
typedef struct {
	size_t idx;          /* delta index of the matched counterpart */
	uint16_t similarity; /* score of the match; 0 means "no match" */
} diff_find_match;
797
/*
 * Rewrite `diff` in place, turning ADD / DELETE pairs of similar files
 * into RENAMED or COPIED deltas and optionally breaking heavily rewritten
 * MODIFIED deltas apart.
 *
 * `given_opts` may be NULL, in which case the behaviour is driven by the
 * repository configuration (see normalize_find_opts).
 *
 * The work happens in four phases:
 *  1. label every delta as a possible rename source and / or target,
 *  2. find the best source for every target (repeated until stable),
 *  3. rewrite the matched deltas,
 *  4. physically split / delete the deltas that were flagged.
 *
 * Returns 0 on success and a negative value on error.  Every exit after
 * the options have been normalized goes through `cleanup`, so the
 * signature cache, the match tables and an internally allocated metric
 * are released on allocation failures as well.
 */
int git_diff_find_similar(
	git_diff *diff,
	const git_diff_find_options *given_opts)
{
	size_t s, t;
	int error = 0, result;
	uint16_t similarity;
	git_diff_delta *src, *tgt;
	git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT;
	size_t num_deltas, num_srcs = 0, num_tgts = 0;
	size_t tried_srcs = 0, tried_tgts = 0;
	size_t num_rewrites = 0, num_updates = 0, num_bumped = 0;
	void **sigcache = NULL; /* cache of similarity metric file signatures */
	diff_find_match *tgt2src = NULL;
	diff_find_match *src2tgt = NULL;
	diff_find_match *tgt2src_copy = NULL;
	diff_find_match *best_match;
	git_diff_file swap;

	assert(diff);

	if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0)
		return error;

	num_deltas = diff->deltas.length;

	/* TODO: maybe abort if deltas.length > rename_limit ??? */
	if (!num_deltas || !git__is_uint32(num_deltas))
		goto cleanup;

	/* No flags set; nothing to do */
	if ((opts.flags & GIT_DIFF_FIND_ALL) == 0)
		goto cleanup;

	/* Two slots (old side, new side) per delta.  The element count and
	 * element size are passed separately so that the allocator, rather
	 * than a multiplication done here, has to cope with an oversized
	 * request.
	 */
	sigcache = git__calloc(num_deltas, 2 * sizeof(void *));
	if (sigcache == NULL) {
		error = -1;
		goto cleanup;
	}

	/* Label rename sources and targets
	 *
	 * This will also set self-similarity scores for MODIFIED files and
	 * mark them for splitting if break-rewrites is enabled
	 */
	git_vector_foreach(&diff->deltas, t, tgt) {
		if (is_rename_source(diff, &opts, t, sigcache))
			++num_srcs;

		if (is_rename_target(diff, &opts, t, sigcache))
			++num_tgts;

		if ((tgt->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0)
			num_rewrites++;
	}

	/* if there are no candidate srcs or tgts, we're done */
	if (!num_srcs || !num_tgts)
		goto cleanup;

	src2tgt = git__calloc(num_deltas, sizeof(diff_find_match));
	if (src2tgt == NULL) {
		error = -1;
		goto cleanup;
	}

	tgt2src = git__calloc(num_deltas, sizeof(diff_find_match));
	if (tgt2src == NULL) {
		error = -1;
		goto cleanup;
	}

	if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
		tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match));
		if (tgt2src_copy == NULL) {
			error = -1;
			goto cleanup;
		}
	}

	/*
	 * Find best-fit matches for rename / copy candidates
	 */

find_best_matches:
	tried_tgts = num_bumped = 0;

	git_vector_foreach(&diff->deltas, t, tgt) {
		/* skip things that are not rename targets */
		if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
			continue;

		tried_srcs = 0;

		git_vector_foreach(&diff->deltas, s, src) {
			/* skip things that are not rename sources */
			if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0)
				continue;

			/* calculate similarity for this pair and find best match */
			if (s == t)
				result = -1; /* don't measure self-similarity here */
			else if ((error = similarity_measure(
				&result, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0)
				goto cleanup;

			if (result < 0)
				continue;
			similarity = (uint16_t)result;

			/* is this a better rename? */
			if (tgt2src[t].similarity < similarity &&
				src2tgt[s].similarity < similarity)
			{
				/* eject old mapping */
				if (src2tgt[s].similarity > 0) {
					tgt2src[src2tgt[s].idx].similarity = 0;
					num_bumped++;
				}
				if (tgt2src[t].similarity > 0) {
					src2tgt[tgt2src[t].idx].similarity = 0;
					num_bumped++;
				}

				/* write new mapping */
				tgt2src[t].idx = s;
				tgt2src[t].similarity = similarity;
				src2tgt[s].idx = t;
				src2tgt[s].similarity = similarity;
			}

			/* keep best absolute match for copies */
			if (tgt2src_copy != NULL &&
				tgt2src_copy[t].similarity < similarity)
			{
				tgt2src_copy[t].idx = s;
				tgt2src_copy[t].similarity = similarity;
			}

			if (++tried_srcs >= num_srcs)
				break;

			/* cap on maximum targets we'll examine (per "tgt" file) */
			if (tried_srcs > opts.rename_limit)
				break;
		}

		if (++tried_tgts >= num_tgts)
			break;
	}

	if (num_bumped > 0) /* try again if we bumped some items */
		goto find_best_matches;

	/*
	 * Rewrite the diffs with renames / copies
	 */

	git_vector_foreach(&diff->deltas, t, tgt) {
		/* skip things that are not rename targets */
		if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0)
			continue;

		/* check if this delta was the target of a similarity */
		if (tgt2src[t].similarity)
			best_match = &tgt2src[t];
		else if (tgt2src_copy && tgt2src_copy[t].similarity)
			best_match = &tgt2src_copy[t];
		else
			continue;

		s = best_match->idx;
		src = GIT_VECTOR_GET(&diff->deltas, s);

		/* possible scenarios:
		 * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME
		 * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE
		 * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME
		 * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT
		 * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY
		 */

		if (src->status == GIT_DELTA_DELETED) {

			if (delta_is_new_only(tgt)) {

				if (best_match->similarity < opts.rename_threshold)
					continue;

				delta_make_rename(tgt, src, best_match->similarity);

				src->flags |= GIT_DIFF_FLAG__TO_DELETE;
				num_rewrites++;
			} else {
				assert(delta_is_split(tgt));

				if (best_match->similarity < opts.rename_from_rewrite_threshold)
					continue;

				/* keep the target's old side; it becomes the DELETE */
				memcpy(&swap, &tgt->old_file, sizeof(swap));

				delta_make_rename(tgt, src, best_match->similarity);
				num_rewrites--;

				assert(src->status == GIT_DELTA_DELETED);
				memcpy(&src->old_file, &swap, sizeof(src->old_file));
				memset(&src->new_file, 0, sizeof(src->new_file));
				src->new_file.path = src->old_file.path;
				src->new_file.flags |= GIT_DIFF_FLAG_VALID_ID;

				num_updates++;

				if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
					/* what used to be at src t is now at src s */
					tgt2src[src2tgt[t].idx].idx = s;
				}
			}
		}

		else if (delta_is_split(src)) {

			if (delta_is_new_only(tgt)) {

				if (best_match->similarity < opts.rename_threshold)
					continue;

				delta_make_rename(tgt, src, best_match->similarity);

				/* the source keeps only its new side */
				src->status = (diff->new_src == GIT_ITERATOR_WORKDIR) ?
					GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED;
				src->nfiles = 1;
				memset(&src->old_file, 0, sizeof(src->old_file));
				src->old_file.path = src->new_file.path;
				src->old_file.flags |= GIT_DIFF_FLAG_VALID_ID;

				src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
				num_rewrites--;

				num_updates++;
			} else {
				assert(delta_is_split(src));

				if (best_match->similarity < opts.rename_from_rewrite_threshold)
					continue;

				memcpy(&swap, &tgt->old_file, sizeof(swap));

				delta_make_rename(tgt, src, best_match->similarity);
				num_rewrites--;
				num_updates++;

				memcpy(&src->old_file, &swap, sizeof(src->old_file));

				/* if we've just swapped the new element into the correct
				 * place, clear the SPLIT flag
				 */
				if (tgt2src[s].idx == t &&
					tgt2src[s].similarity >
					opts.rename_from_rewrite_threshold) {
					src->status     = GIT_DELTA_RENAMED;
					src->similarity = tgt2src[s].similarity;
					tgt2src[s].similarity = 0;
					src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;
					num_rewrites--;
				}
				/* otherwise, if we just overwrote a source, update mapping */
				else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) {
					/* what used to be at src t is now at src s */
					tgt2src[src2tgt[t].idx].idx = s;
				}

				num_updates++;
			}
		}

		else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) {
			if (tgt2src_copy[t].similarity < opts.copy_threshold)
				continue;

			/* always use best possible source for copy */
			best_match = &tgt2src_copy[t];
			src = GIT_VECTOR_GET(&diff->deltas, best_match->idx);

			if (delta_is_split(tgt)) {
				error = insert_delete_side_of_split(diff, &diff->deltas, tgt);
				if (error < 0)
					goto cleanup;
				num_rewrites--;
			}

			if (!delta_is_split(tgt) && !delta_is_new_only(tgt))
				continue;

			tgt->status     = GIT_DELTA_COPIED;
			tgt->similarity = best_match->similarity;
			tgt->nfiles     = 2;
			memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file));
			tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT;

			num_updates++;
		}
	}

	/*
	 * Actually split and delete entries as needed
	 */

	if (num_rewrites > 0 || num_updates > 0)
		error = apply_splits_and_deletes(
			diff, diff->deltas.length - num_rewrites,
			FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES) &&
			!FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY));

cleanup:
	git__free(tgt2src);
	git__free(src2tgt);
	git__free(tgt2src_copy);

	if (sigcache) {
		/* num_deltas is the count the cache was sized for */
		for (t = 0; t < num_deltas * 2; ++t) {
			if (sigcache[t] != NULL)
				opts.metric->free_signature(sigcache[t], opts.metric->payload);
		}
		git__free(sigcache);
	}

	/* the metric is ours only if normalize_find_opts had to create it */
	if (!given_opts || !given_opts->metric)
		git__free(opts.metric);

	return error;
}
1118
1119 #undef FLAG_SET
1120