Merge branch 'ab/config-based-hooks-base' into seen
[git] / diffcore-rename.c
1 /*
2  *
3  * Copyright (C) 2005 Junio C Hamano
4  */
5 #include "cache.h"
6 #include "diff.h"
7 #include "diffcore.h"
8 #include "object-store.h"
9 #include "hashmap.h"
10 #include "progress.h"
11 #include "promisor-remote.h"
12 #include "strmap.h"
13
14 /* Table of rename/copy destinations */
15
16 static struct diff_rename_dst {
17         struct diff_filepair *p;
18         struct diff_filespec *filespec_to_free;
19         int is_rename; /* false -> just a create; true -> rename or copy */
20 } *rename_dst;
21 static int rename_dst_nr, rename_dst_alloc;
22 /* Mapping from break source pathname to break destination index */
23 static struct strintmap *break_idx = NULL;
24
25 static struct diff_rename_dst *locate_rename_dst(struct diff_filepair *p)
26 {
27         /* Lookup by p->ONE->path */
28         int idx = break_idx ? strintmap_get(break_idx, p->one->path) : -1;
29         return (idx == -1) ? NULL : &rename_dst[idx];
30 }
31
32 /*
33  * Returns 0 on success, -1 if we found a duplicate.
34  */
35 static int add_rename_dst(struct diff_filepair *p)
36 {
37         ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc);
38         rename_dst[rename_dst_nr].p = p;
39         rename_dst[rename_dst_nr].filespec_to_free = NULL;
40         rename_dst[rename_dst_nr].is_rename = 0;
41         rename_dst_nr++;
42         return 0;
43 }
44
45 /* Table of rename/copy src files */
46 static struct diff_rename_src {
47         struct diff_filepair *p;
48         unsigned short score; /* to remember the break score */
49 } *rename_src;
50 static int rename_src_nr, rename_src_alloc;
51
52 static void register_rename_src(struct diff_filepair *p)
53 {
54         if (p->broken_pair) {
55                 if (!break_idx) {
56                         break_idx = xmalloc(sizeof(*break_idx));
57                         strintmap_init_with_options(break_idx, -1, NULL, 0);
58                 }
59                 strintmap_set(break_idx, p->one->path, rename_dst_nr);
60         }
61
62         ALLOC_GROW(rename_src, rename_src_nr + 1, rename_src_alloc);
63         rename_src[rename_src_nr].p = p;
64         rename_src[rename_src_nr].score = p->score;
65         rename_src_nr++;
66 }
67
68 static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
69 {
70         int src_len = strlen(src->path), dst_len = strlen(dst->path);
71         while (src_len && dst_len) {
72                 char c1 = src->path[--src_len];
73                 char c2 = dst->path[--dst_len];
74                 if (c1 != c2)
75                         return 0;
76                 if (c1 == '/')
77                         return 1;
78         }
79         return (!src_len || src->path[src_len - 1] == '/') &&
80                 (!dst_len || dst->path[dst_len - 1] == '/');
81 }
82
83 struct diff_score {
84         int src; /* index in rename_src */
85         int dst; /* index in rename_dst */
86         unsigned short score;
87         short name_score;
88 };
89
90 struct inexact_prefetch_options {
91         struct repository *repo;
92         int skip_unmodified;
93 };
94 static void inexact_prefetch(void *prefetch_options)
95 {
96         struct inexact_prefetch_options *options = prefetch_options;
97         int i;
98         struct oid_array to_fetch = OID_ARRAY_INIT;
99
100         for (i = 0; i < rename_dst_nr; i++) {
101                 if (rename_dst[i].p->renamed_pair)
102                         /*
103                          * The loop in diffcore_rename() will not need these
104                          * blobs, so skip prefetching.
105                          */
106                         continue; /* already found exact match */
107                 diff_add_if_missing(options->repo, &to_fetch,
108                                     rename_dst[i].p->two);
109         }
110         for (i = 0; i < rename_src_nr; i++) {
111                 if (options->skip_unmodified &&
112                     diff_unmodified_pair(rename_src[i].p))
113                         /*
114                          * The loop in diffcore_rename() will not need these
115                          * blobs, so skip prefetching.
116                          */
117                         continue;
118                 diff_add_if_missing(options->repo, &to_fetch,
119                                     rename_src[i].p->one);
120         }
121         promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr);
122         oid_array_clear(&to_fetch);
123 }
124
125 static int estimate_similarity(struct repository *r,
126                                struct diff_filespec *src,
127                                struct diff_filespec *dst,
128                                int minimum_score,
129                                struct diff_populate_filespec_options *dpf_opt)
130 {
131         /* src points at a file that existed in the original tree (or
132          * optionally a file in the destination tree) and dst points
133          * at a newly created file.  They may be quite similar, in which
134          * case we want to say src is renamed to dst or src is copied into
135          * dst, and then some edit has been applied to dst.
136          *
137          * Compare them and return how similar they are, representing
138          * the score as an integer between 0 and MAX_SCORE.
139          *
140          * When there is an exact match, it is considered a better
141          * match than anything else; the destination does not even
142          * call into this function in that case.
143          */
144         unsigned long max_size, delta_size, base_size, src_copied, literal_added;
145         int score;
146
147         /* We deal only with regular files.  Symlink renames are handled
148          * only when they are exact matches --- in other words, no edits
149          * after renaming.
150          */
151         if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))
152                 return 0;
153
154         /*
155          * Need to check that source and destination sizes are
156          * filled in before comparing them.
157          *
158          * If we already have "cnt_data" filled in, we know it's
159          * all good (avoid checking the size for zero, as that
160          * is a possible size - we really should have a flag to
161          * say whether the size is valid or not!)
162          */
163         dpf_opt->check_size_only = 1;
164
165         if (!src->cnt_data &&
166             diff_populate_filespec(r, src, dpf_opt))
167                 return 0;
168         if (!dst->cnt_data &&
169             diff_populate_filespec(r, dst, dpf_opt))
170                 return 0;
171
172         max_size = ((src->size > dst->size) ? src->size : dst->size);
173         base_size = ((src->size < dst->size) ? src->size : dst->size);
174         delta_size = max_size - base_size;
175
176         /* We would not consider edits that change the file size so
177          * drastically.  delta_size must be smaller than
178          * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
179          *
180          * Note that base_size == 0 case is handled here already
181          * and the final score computation below would not have a
182          * divide-by-zero issue.
183          */
184         if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
185                 return 0;
186
187         dpf_opt->check_size_only = 0;
188
189         if (!src->cnt_data && diff_populate_filespec(r, src, dpf_opt))
190                 return 0;
191         if (!dst->cnt_data && diff_populate_filespec(r, dst, dpf_opt))
192                 return 0;
193
194         if (diffcore_count_changes(r, src, dst,
195                                    &src->cnt_data, &dst->cnt_data,
196                                    &src_copied, &literal_added))
197                 return 0;
198
199         /* How similar are they?
200          * what percentage of material in dst are from source?
201          */
202         if (!dst->size)
203                 score = 0; /* should not happen */
204         else
205                 score = (int)(src_copied * MAX_SCORE / max_size);
206         return score;
207 }
208
209 static void record_rename_pair(int dst_index, int src_index, int score)
210 {
211         struct diff_filepair *src = rename_src[src_index].p;
212         struct diff_filepair *dst = rename_dst[dst_index].p;
213
214         if (dst->renamed_pair)
215                 die("internal error: dst already matched.");
216
217         src->one->rename_used++;
218         src->one->count++;
219
220         rename_dst[dst_index].filespec_to_free = dst->one;
221         rename_dst[dst_index].is_rename = 1;
222
223         dst->one = src->one;
224         dst->renamed_pair = 1;
225         if (!strcmp(dst->one->path, dst->two->path))
226                 dst->score = rename_src[src_index].score;
227         else
228                 dst->score = score;
229 }
230
231 /*
232  * We sort the rename similarity matrix with the score, in descending
233  * order (the most similar first).
234  */
235 static int score_compare(const void *a_, const void *b_)
236 {
237         const struct diff_score *a = a_, *b = b_;
238
239         /* sink the unused ones to the bottom */
240         if (a->dst < 0)
241                 return (0 <= b->dst);
242         else if (b->dst < 0)
243                 return -1;
244
245         if (a->score == b->score)
246                 return b->name_score - a->name_score;
247
248         return b->score - a->score;
249 }
250
251 struct file_similarity {
252         struct hashmap_entry entry;
253         int index;
254         struct diff_filespec *filespec;
255 };
256
257 static unsigned int hash_filespec(struct repository *r,
258                                   struct diff_filespec *filespec)
259 {
260         if (!filespec->oid_valid) {
261                 if (diff_populate_filespec(r, filespec, NULL))
262                         return 0;
263                 hash_object_file(r->hash_algo, filespec->data, filespec->size,
264                                  "blob", &filespec->oid);
265         }
266         return oidhash(&filespec->oid);
267 }
268
269 static int find_identical_files(struct hashmap *srcs,
270                                 int dst_index,
271                                 struct diff_options *options)
272 {
273         int renames = 0;
274         struct diff_filespec *target = rename_dst[dst_index].p->two;
275         struct file_similarity *p, *best = NULL;
276         int i = 100, best_score = -1;
277         unsigned int hash = hash_filespec(options->repo, target);
278
279         /*
280          * Find the best source match for specified destination.
281          */
282         p = hashmap_get_entry_from_hash(srcs, hash, NULL,
283                                         struct file_similarity, entry);
284         hashmap_for_each_entry_from(srcs, p, entry) {
285                 int score;
286                 struct diff_filespec *source = p->filespec;
287
288                 /* False hash collision? */
289                 if (!oideq(&source->oid, &target->oid))
290                         continue;
291                 /* Non-regular files? If so, the modes must match! */
292                 if (!S_ISREG(source->mode) || !S_ISREG(target->mode)) {
293                         if (source->mode != target->mode)
294                                 continue;
295                 }
296                 /* Give higher scores to sources that haven't been used already */
297                 score = !source->rename_used;
298                 if (source->rename_used && options->detect_rename != DIFF_DETECT_COPY)
299                         continue;
300                 score += basename_same(source, target);
301                 if (score > best_score) {
302                         best = p;
303                         best_score = score;
304                         if (score == 2)
305                                 break;
306                 }
307
308                 /* Too many identical alternatives? Pick one */
309                 if (!--i)
310                         break;
311         }
312         if (best) {
313                 record_rename_pair(dst_index, best->index, MAX_SCORE);
314                 renames++;
315         }
316         return renames;
317 }
318
319 static void insert_file_table(struct repository *r,
320                               struct hashmap *table, int index,
321                               struct diff_filespec *filespec)
322 {
323         struct file_similarity *entry = xmalloc(sizeof(*entry));
324
325         entry->index = index;
326         entry->filespec = filespec;
327
328         hashmap_entry_init(&entry->entry, hash_filespec(r, filespec));
329         hashmap_add(table, &entry->entry);
330 }
331
332 /*
333  * Find exact renames first.
334  *
335  * The first round matches up the up-to-date entries,
336  * and then during the second round we try to match
337  * cache-dirty entries as well.
338  */
339 static int find_exact_renames(struct diff_options *options)
340 {
341         int i, renames = 0;
342         struct hashmap file_table;
343
344         /* Add all sources to the hash table in reverse order, because
345          * later on they will be retrieved in LIFO order.
346          */
347         hashmap_init(&file_table, NULL, NULL, rename_src_nr);
348         for (i = rename_src_nr-1; i >= 0; i--)
349                 insert_file_table(options->repo,
350                                   &file_table, i,
351                                   rename_src[i].p->one);
352
353         /* Walk the destinations and find best source match */
354         for (i = 0; i < rename_dst_nr; i++)
355                 renames += find_identical_files(&file_table, i, options);
356
357         /* Free the hash data structure and entries */
358         hashmap_clear_and_free(&file_table, struct file_similarity, entry);
359
360         return renames;
361 }
362
363 struct dir_rename_info {
364         struct strintmap idx_map;
365         struct strmap dir_rename_guess;
366         struct strmap *dir_rename_count;
367         struct strintmap *relevant_source_dirs;
368         unsigned setup;
369 };
370
371 static char *get_dirname(const char *filename)
372 {
373         char *slash = strrchr(filename, '/');
374         return slash ? xstrndup(filename, slash - filename) : xstrdup("");
375 }
376
377 static void dirname_munge(char *filename)
378 {
379         char *slash = strrchr(filename, '/');
380         if (!slash)
381                 slash = filename;
382         *slash = '\0';
383 }
384
385 static const char *get_highest_rename_path(struct strintmap *counts)
386 {
387         int highest_count = 0;
388         const char *highest_destination_dir = NULL;
389         struct hashmap_iter iter;
390         struct strmap_entry *entry;
391
392         strintmap_for_each_entry(counts, &iter, entry) {
393                 const char *destination_dir = entry->key;
394                 intptr_t count = (intptr_t)entry->value;
395                 if (count > highest_count) {
396                         highest_count = count;
397                         highest_destination_dir = destination_dir;
398                 }
399         }
400         return highest_destination_dir;
401 }
402
403 static char *UNKNOWN_DIR = "/";  /* placeholder -- short, illegal directory */
404
405 static int dir_rename_already_determinable(struct strintmap *counts)
406 {
407         struct hashmap_iter iter;
408         struct strmap_entry *entry;
409         int first = 0, second = 0, unknown = 0;
410         strintmap_for_each_entry(counts, &iter, entry) {
411                 const char *destination_dir = entry->key;
412                 intptr_t count = (intptr_t)entry->value;
413                 if (!strcmp(destination_dir, UNKNOWN_DIR)) {
414                         unknown = count;
415                 } else if (count >= first) {
416                         second = first;
417                         first = count;
418                 } else if (count >= second) {
419                         second = count;
420                 }
421         }
422         return first > second + unknown;
423 }
424
425 static void increment_count(struct dir_rename_info *info,
426                             char *old_dir,
427                             char *new_dir)
428 {
429         struct strintmap *counts;
430         struct strmap_entry *e;
431
432         /* Get the {new_dirs -> counts} mapping using old_dir */
433         e = strmap_get_entry(info->dir_rename_count, old_dir);
434         if (e) {
435                 counts = e->value;
436         } else {
437                 counts = xmalloc(sizeof(*counts));
438                 strintmap_init_with_options(counts, 0, NULL, 1);
439                 strmap_put(info->dir_rename_count, old_dir, counts);
440         }
441
442         /* Increment the count for new_dir */
443         strintmap_incr(counts, new_dir, 1);
444 }
445
446 static void update_dir_rename_counts(struct dir_rename_info *info,
447                                      struct strintmap *dirs_removed,
448                                      const char *oldname,
449                                      const char *newname)
450 {
451         char *old_dir = xstrdup(oldname);
452         char *new_dir = xstrdup(newname);
453         char new_dir_first_char = new_dir[0];
454         int first_time_in_loop = 1;
455
456         if (!info->setup)
457                 /*
458                  * info->setup is 0 here in two cases: (1) all auxiliary
459                  * vars (like dirs_removed) were NULL so
460                  * initialize_dir_rename_info() returned early, or (2)
461                  * either break detection or copy detection are active so
462                  * that we never called initialize_dir_rename_info().  In
463                  * the former case, we don't have enough info to know if
464                  * directories were renamed (because dirs_removed lets us
465                  * know about a necessary prerequisite, namely if they were
466                  * removed), and in the latter, we don't care about
467                  * directory renames or find_basename_matches.
468                  *
469                  * This matters because both basename and inexact matching
470                  * will also call update_dir_rename_counts().  In either of
471                  * the above two cases info->dir_rename_counts will not
472                  * have been properly initialized which prevents us from
473                  * updating it, but in these two cases we don't care about
474                  * dir_rename_counts anyway, so we can just exit early.
475                  */
476                 return;
477
478         while (1) {
479                 int drd_flag = NOT_RELEVANT;
480
481                 /* Get old_dir, skip if its directory isn't relevant. */
482                 dirname_munge(old_dir);
483                 if (info->relevant_source_dirs &&
484                     !strintmap_contains(info->relevant_source_dirs, old_dir))
485                         break;
486
487                 /* Get new_dir */
488                 dirname_munge(new_dir);
489
490                 /*
491                  * When renaming
492                  *   "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c"
493                  * then this suggests that both
494                  *   a/b/c/d/e/ => a/b/some/thing/else/e/
495                  *   a/b/c/d/   => a/b/some/thing/else/
496                  * so we want to increment counters for both.  We do NOT,
497                  * however, also want to suggest that there was the following
498                  * rename:
499                  *   a/b/c/ => a/b/some/thing/
500                  * so we need to quit at that point.
501                  *
502                  * Note the when first_time_in_loop, we only strip off the
503                  * basename, and we don't care if that's different.
504                  */
505                 if (!first_time_in_loop) {
506                         char *old_sub_dir = strchr(old_dir, '\0')+1;
507                         char *new_sub_dir = strchr(new_dir, '\0')+1;
508                         if (!*new_dir) {
509                                 /*
510                                  * Special case when renaming to root directory,
511                                  * i.e. when new_dir == "".  In this case, we had
512                                  * something like
513                                  *    a/b/subdir => subdir
514                                  * and so dirname_munge() sets things up so that
515                                  *    old_dir = "a/b\0subdir\0"
516                                  *    new_dir = "\0ubdir\0"
517                                  * We didn't have a '/' to overwrite a '\0' onto
518                                  * in new_dir, so we have to compare differently.
519                                  */
520                                 if (new_dir_first_char != old_sub_dir[0] ||
521                                     strcmp(old_sub_dir+1, new_sub_dir))
522                                         break;
523                         } else {
524                                 if (strcmp(old_sub_dir, new_sub_dir))
525                                         break;
526                         }
527                 }
528
529                 /*
530                  * Above we suggested that we'd keep recording renames for
531                  * all ancestor directories where the trailing directories
532                  * matched, i.e. for
533                  *   "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c"
534                  * we'd increment rename counts for each of
535                  *   a/b/c/d/e/ => a/b/some/thing/else/e/
536                  *   a/b/c/d/   => a/b/some/thing/else/
537                  * However, we only need the rename counts for directories
538                  * in dirs_removed whose value is RELEVANT_FOR_SELF.
539                  * However, we add one special case of also recording it for
540                  * first_time_in_loop because find_basename_matches() can
541                  * use that as a hint to find a good pairing.
542                  */
543                 if (dirs_removed)
544                         drd_flag = strintmap_get(dirs_removed, old_dir);
545                 if (drd_flag == RELEVANT_FOR_SELF || first_time_in_loop)
546                         increment_count(info, old_dir, new_dir);
547
548                 first_time_in_loop = 0;
549                 if (drd_flag == NOT_RELEVANT)
550                         break;
551                 /* If we hit toplevel directory ("") for old or new dir, quit */
552                 if (!*old_dir || !*new_dir)
553                         break;
554         }
555
556         /* Free resources we don't need anymore */
557         free(old_dir);
558         free(new_dir);
559 }
560
561 static void initialize_dir_rename_info(struct dir_rename_info *info,
562                                        struct strintmap *relevant_sources,
563                                        struct strintmap *dirs_removed,
564                                        struct strmap *dir_rename_count,
565                                        struct strmap *cached_pairs)
566 {
567         struct hashmap_iter iter;
568         struct strmap_entry *entry;
569         int i;
570
571         if (!dirs_removed && !relevant_sources) {
572                 info->setup = 0;
573                 return;
574         }
575         info->setup = 1;
576
577         info->dir_rename_count = dir_rename_count;
578         if (!info->dir_rename_count) {
579                 info->dir_rename_count = xmalloc(sizeof(*dir_rename_count));
580                 strmap_init(info->dir_rename_count);
581         }
582         strintmap_init_with_options(&info->idx_map, -1, NULL, 0);
583         strmap_init_with_options(&info->dir_rename_guess, NULL, 0);
584
585         /* Setup info->relevant_source_dirs */
586         info->relevant_source_dirs = NULL;
587         if (dirs_removed || !relevant_sources) {
588                 info->relevant_source_dirs = dirs_removed; /* might be NULL */
589         } else {
590                 info->relevant_source_dirs = xmalloc(sizeof(struct strintmap));
591                 strintmap_init(info->relevant_source_dirs, 0 /* unused */);
592                 strintmap_for_each_entry(relevant_sources, &iter, entry) {
593                         char *dirname = get_dirname(entry->key);
594                         if (!dirs_removed ||
595                             strintmap_contains(dirs_removed, dirname))
596                                 strintmap_set(info->relevant_source_dirs,
597                                               dirname, 0 /* value irrelevant */);
598                         free(dirname);
599                 }
600         }
601
602         /*
603          * Loop setting up both info->idx_map, and doing setup of
604          * info->dir_rename_count.
605          */
606         for (i = 0; i < rename_dst_nr; ++i) {
607                 /*
608                  * For non-renamed files, make idx_map contain mapping of
609                  *   filename -> index (index within rename_dst, that is)
610                  */
611                 if (!rename_dst[i].is_rename) {
612                         char *filename = rename_dst[i].p->two->path;
613                         strintmap_set(&info->idx_map, filename, i);
614                         continue;
615                 }
616
617                 /*
618                  * For everything else (i.e. renamed files), make
619                  * dir_rename_count contain a map of a map:
620                  *   old_directory -> {new_directory -> count}
621                  * In other words, for every pair look at the directories for
622                  * the old filename and the new filename and count how many
623                  * times that pairing occurs.
624                  */
625                 update_dir_rename_counts(info, dirs_removed,
626                                          rename_dst[i].p->one->path,
627                                          rename_dst[i].p->two->path);
628         }
629
630         /* Add cached_pairs to counts */
631         strmap_for_each_entry(cached_pairs, &iter, entry) {
632                 const char *old_name = entry->key;
633                 const char *new_name = entry->value;
634                 if (!new_name)
635                         /* known delete; ignore it */
636                         continue;
637
638                 update_dir_rename_counts(info, dirs_removed, old_name, new_name);
639         }
640
641         /*
642          * Now we collapse
643          *    dir_rename_count: old_directory -> {new_directory -> count}
644          * down to
645          *    dir_rename_guess: old_directory -> best_new_directory
646          * where best_new_directory is the one with the highest count.
647          */
648         strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
649                 /* entry->key is source_dir */
650                 struct strintmap *counts = entry->value;
651                 char *best_newdir;
652
653                 best_newdir = xstrdup(get_highest_rename_path(counts));
654                 strmap_put(&info->dir_rename_guess, entry->key,
655                            best_newdir);
656         }
657 }
658
659 void partial_clear_dir_rename_count(struct strmap *dir_rename_count)
660 {
661         struct hashmap_iter iter;
662         struct strmap_entry *entry;
663
664         strmap_for_each_entry(dir_rename_count, &iter, entry) {
665                 struct strintmap *counts = entry->value;
666                 strintmap_clear(counts);
667         }
668         strmap_partial_clear(dir_rename_count, 1);
669 }
670
671 static void cleanup_dir_rename_info(struct dir_rename_info *info,
672                                     struct strintmap *dirs_removed,
673                                     int keep_dir_rename_count)
674 {
675         struct hashmap_iter iter;
676         struct strmap_entry *entry;
677         struct string_list to_remove = STRING_LIST_INIT_NODUP;
678         int i;
679
680         if (!info->setup)
681                 return;
682
683         /* idx_map */
684         strintmap_clear(&info->idx_map);
685
686         /* dir_rename_guess */
687         strmap_clear(&info->dir_rename_guess, 1);
688
689         /* relevant_source_dirs */
690         if (info->relevant_source_dirs &&
691             info->relevant_source_dirs != dirs_removed) {
692                 strintmap_clear(info->relevant_source_dirs);
693                 FREE_AND_NULL(info->relevant_source_dirs);
694         }
695
696         /* dir_rename_count */
697         if (!keep_dir_rename_count) {
698                 partial_clear_dir_rename_count(info->dir_rename_count);
699                 strmap_clear(info->dir_rename_count, 1);
700                 FREE_AND_NULL(info->dir_rename_count);
701                 return;
702         }
703
704         /*
705          * Although dir_rename_count was passed in
706          * diffcore_rename_extended() and we want to keep it around and
707          * return it to that caller, we first want to remove any counts in
708          * the maps associated with UNKNOWN_DIR entries and any data
709          * associated with directories that weren't renamed.
710          */
711         strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
712                 const char *source_dir = entry->key;
713                 struct strintmap *counts = entry->value;
714
715                 if (!strintmap_get(dirs_removed, source_dir)) {
716                         string_list_append(&to_remove, source_dir);
717                         strintmap_clear(counts);
718                         continue;
719                 }
720
721                 if (strintmap_contains(counts, UNKNOWN_DIR))
722                         strintmap_remove(counts, UNKNOWN_DIR);
723         }
724         for (i = 0; i < to_remove.nr; ++i)
725                 strmap_remove(info->dir_rename_count,
726                               to_remove.items[i].string, 1);
727         string_list_clear(&to_remove, 0);
728 }
729
730 static const char *get_basename(const char *filename)
731 {
732         /*
733          * gitbasename() has to worry about special drives, multiple
734          * directory separator characters, trailing slashes, NULL or
735          * empty strings, etc.  We only work on filenames as stored in
736          * git, and thus get to ignore all those complications.
737          */
738         const char *base = strrchr(filename, '/');
739         return base ? base + 1 : filename;
740 }
741
742 static int idx_possible_rename(char *filename, struct dir_rename_info *info)
743 {
744         /*
745          * Our comparison of files with the same basename (see
746          * find_basename_matches() below), is only helpful when after exact
747          * rename detection we have exactly one file with a given basename
748          * among the rename sources and also only exactly one file with
749          * that basename among the rename destinations.  When we have
750          * multiple files with the same basename in either set, we do not
751          * know which to compare against.  However, there are some
752          * filenames that occur in large numbers (particularly
753          * build-related filenames such as 'Makefile', '.gitignore', or
754          * 'build.gradle' that potentially exist within every single
755          * subdirectory), and for performance we want to be able to quickly
756          * find renames for these files too.
757          *
758          * The reason basename comparisons are a useful heuristic was that it
759          * is common for people to move files across directories while keeping
760          * their filename the same.  If we had a way of determining or even
761          * making a good educated guess about which directory these non-unique
762          * basename files had moved the file to, we could check it.
763          * Luckily...
764          *
765          * When an entire directory is in fact renamed, we have two factors
766          * helping us out:
767          *   (a) the original directory disappeared giving us a hint
768          *       about when we can apply an extra heuristic.
769          *   (a) we often have several files within that directory and
770          *       subdirectories that are renamed without changes
771          * So, rules for a heuristic:
772          *   (0) If there basename matches are non-unique (the condition under
773          *       which this function is called) AND
774          *   (1) the directory in which the file was found has disappeared
775          *       (i.e. dirs_removed is non-NULL and has a relevant entry) THEN
776          *   (2) use exact renames of files within the directory to determine
777          *       where the directory is likely to have been renamed to.  IF
778          *       there is at least one exact rename from within that
779          *       directory, we can proceed.
780          *   (3) If there are multiple places the directory could have been
781          *       renamed to based on exact renames, ignore all but one of them.
782          *       Just use the destination with the most renames going to it.
783          *   (4) Check if applying that directory rename to the original file
784          *       would result in a destination filename that is in the
785          *       potential rename set.  If so, return the index of the
786          *       destination file (the index within rename_dst).
787          *   (5) Compare the original file and returned destination for
788          *       similarity, and if they are sufficiently similar, record the
789          *       rename.
790          *
791          * This function, idx_possible_rename(), is only responsible for (4).
792          * The conditions/steps in (1)-(3) are handled via setting up
793          * dir_rename_count and dir_rename_guess in
794          * initialize_dir_rename_info().  Steps (0) and (5) are handled by
795          * the caller of this function.
796          */
797         char *old_dir, *new_dir;
798         struct strbuf new_path = STRBUF_INIT;
799         int idx;
800
801         if (!info->setup)
802                 return -1;
803
804         old_dir = get_dirname(filename);
805         new_dir = strmap_get(&info->dir_rename_guess, old_dir);
806         free(old_dir);
807         if (!new_dir)
808                 return -1;
809
810         strbuf_addstr(&new_path, new_dir);
811         strbuf_addch(&new_path, '/');
812         strbuf_addstr(&new_path, get_basename(filename));
813
814         idx = strintmap_get(&info->idx_map, new_path.buf);
815         strbuf_release(&new_path);
816         return idx;
817 }
818
819 struct basename_prefetch_options {
820         struct repository *repo;
821         struct strintmap *relevant_sources;
822         struct strintmap *sources;
823         struct strintmap *dests;
824         struct dir_rename_info *info;
825 };
826 static void basename_prefetch(void *prefetch_options)
827 {
828         struct basename_prefetch_options *options = prefetch_options;
829         struct strintmap *relevant_sources = options->relevant_sources;
830         struct strintmap *sources = options->sources;
831         struct strintmap *dests = options->dests;
832         struct dir_rename_info *info = options->info;
833         int i;
834         struct oid_array to_fetch = OID_ARRAY_INIT;
835
836         /*
837          * TODO: The following loops mirror the code/logic from
838          * find_basename_matches(), though not quite exactly.  Maybe
839          * abstract the iteration logic out somehow?
840          */
841         for (i = 0; i < rename_src_nr; ++i) {
842                 char *filename = rename_src[i].p->one->path;
843                 const char *base = NULL;
844                 intptr_t src_index;
845                 intptr_t dst_index;
846
847                 /* Skip irrelevant sources */
848                 if (relevant_sources &&
849                     !strintmap_contains(relevant_sources, filename))
850                         continue;
851
852                 /*
853                  * If the basename is unique among remaining sources, then
854                  * src_index will equal 'i' and we can attempt to match it
855                  * to a unique basename in the destinations.  Otherwise,
856                  * use directory rename heuristics, if possible.
857                  */
858                 base = get_basename(filename);
859                 src_index = strintmap_get(sources, base);
860                 assert(src_index == -1 || src_index == i);
861
862                 if (strintmap_contains(dests, base)) {
863                         struct diff_filespec *one, *two;
864
865                         /* Find a matching destination, if possible */
866                         dst_index = strintmap_get(dests, base);
867                         if (src_index == -1 || dst_index == -1) {
868                                 src_index = i;
869                                 dst_index = idx_possible_rename(filename, info);
870                         }
871                         if (dst_index == -1)
872                                 continue;
873
874                         /* Ignore this dest if already used in a rename */
875                         if (rename_dst[dst_index].is_rename)
876                                 continue; /* already used previously */
877
878                         one = rename_src[src_index].p->one;
879                         two = rename_dst[dst_index].p->two;
880
881                         /* Add the pairs */
882                         diff_add_if_missing(options->repo, &to_fetch, two);
883                         diff_add_if_missing(options->repo, &to_fetch, one);
884                 }
885         }
886
887         promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr);
888         oid_array_clear(&to_fetch);
889 }
890
891 static int find_basename_matches(struct diff_options *options,
892                                  int minimum_score,
893                                  struct dir_rename_info *info,
894                                  struct strintmap *relevant_sources,
895                                  struct strintmap *dirs_removed)
896 {
897         /*
898          * When I checked in early 2020, over 76% of file renames in linux
899          * just moved files to a different directory but kept the same
900          * basename.  gcc did that with over 64% of renames, gecko did it
901          * with over 79%, and WebKit did it with over 89%.
902          *
903          * Therefore we can bypass the normal exhaustive NxM matrix
904          * comparison of similarities between all potential rename sources
905          * and destinations by instead using file basename as a hint (i.e.
906          * the portion of the filename after the last '/'), checking for
907          * similarity between files with the same basename, and if we find
908          * a pair that are sufficiently similar, record the rename pair and
909          * exclude those two from the NxM matrix.
910          *
911          * This *might* cause us to find a less than optimal pairing (if
912          * there is another file that we are even more similar to but has a
913          * different basename).  Given the huge performance advantage
914          * basename matching provides, and given the frequency with which
915          * people use the same basename in real world projects, that's a
916          * trade-off we are willing to accept when doing just rename
917          * detection.
918          *
919          * If someone wants copy detection that implies they are willing to
920          * spend more cycles to find similarities between files, so it may
921          * be less likely that this heuristic is wanted.  If someone is
922          * doing break detection, that means they do not want filename
923          * similarity to imply any form of content similiarity, and thus
924          * this heuristic would definitely be incompatible.
925          */
926
927         int i, renames = 0;
928         struct strintmap sources;
929         struct strintmap dests;
930         struct diff_populate_filespec_options dpf_options = {
931                 .check_binary = 0,
932                 .missing_object_cb = NULL,
933                 .missing_object_data = NULL
934         };
935         struct basename_prefetch_options prefetch_options = {
936                 .repo = options->repo,
937                 .relevant_sources = relevant_sources,
938                 .sources = &sources,
939                 .dests = &dests,
940                 .info = info
941         };
942
943         /*
944          * Create maps of basename -> fullname(s) for remaining sources and
945          * dests.
946          */
947         strintmap_init_with_options(&sources, -1, NULL, 0);
948         strintmap_init_with_options(&dests, -1, NULL, 0);
949         for (i = 0; i < rename_src_nr; ++i) {
950                 char *filename = rename_src[i].p->one->path;
951                 const char *base;
952
953                 /* exact renames removed in remove_unneeded_paths_from_src() */
954                 assert(!rename_src[i].p->one->rename_used);
955
956                 /* Record index within rename_src (i) if basename is unique */
957                 base = get_basename(filename);
958                 if (strintmap_contains(&sources, base))
959                         strintmap_set(&sources, base, -1);
960                 else
961                         strintmap_set(&sources, base, i);
962         }
963         for (i = 0; i < rename_dst_nr; ++i) {
964                 char *filename = rename_dst[i].p->two->path;
965                 const char *base;
966
967                 if (rename_dst[i].is_rename)
968                         continue; /* involved in exact match already. */
969
970                 /* Record index within rename_dst (i) if basename is unique */
971                 base = get_basename(filename);
972                 if (strintmap_contains(&dests, base))
973                         strintmap_set(&dests, base, -1);
974                 else
975                         strintmap_set(&dests, base, i);
976         }
977
978         if (options->repo == the_repository && has_promisor_remote()) {
979                 dpf_options.missing_object_cb = basename_prefetch;
980                 dpf_options.missing_object_data = &prefetch_options;
981         }
982
983         /* Now look for basename matchups and do similarity estimation */
984         for (i = 0; i < rename_src_nr; ++i) {
985                 char *filename = rename_src[i].p->one->path;
986                 const char *base = NULL;
987                 intptr_t src_index;
988                 intptr_t dst_index;
989
990                 /* Skip irrelevant sources */
991                 if (relevant_sources &&
992                     !strintmap_contains(relevant_sources, filename))
993                         continue;
994
995                 /*
996                  * If the basename is unique among remaining sources, then
997                  * src_index will equal 'i' and we can attempt to match it
998                  * to a unique basename in the destinations.  Otherwise,
999                  * use directory rename heuristics, if possible.
1000                  */
1001                 base = get_basename(filename);
1002                 src_index = strintmap_get(&sources, base);
1003                 assert(src_index == -1 || src_index == i);
1004
1005                 if (strintmap_contains(&dests, base)) {
1006                         struct diff_filespec *one, *two;
1007                         int score;
1008
1009                         /* Find a matching destination, if possible */
1010                         dst_index = strintmap_get(&dests, base);
1011                         if (src_index == -1 || dst_index == -1) {
1012                                 src_index = i;
1013                                 dst_index = idx_possible_rename(filename, info);
1014                         }
1015                         if (dst_index == -1)
1016                                 continue;
1017
1018                         /* Ignore this dest if already used in a rename */
1019                         if (rename_dst[dst_index].is_rename)
1020                                 continue; /* already used previously */
1021
1022                         /* Estimate the similarity */
1023                         one = rename_src[src_index].p->one;
1024                         two = rename_dst[dst_index].p->two;
1025                         score = estimate_similarity(options->repo, one, two,
1026                                                     minimum_score, &dpf_options);
1027
1028                         /* If sufficiently similar, record as rename pair */
1029                         if (score < minimum_score)
1030                                 continue;
1031                         record_rename_pair(dst_index, src_index, score);
1032                         renames++;
1033                         update_dir_rename_counts(info, dirs_removed,
1034                                                  one->path, two->path);
1035
1036                         /*
1037                          * Found a rename so don't need text anymore; if we
1038                          * didn't find a rename, the filespec_blob would get
1039                          * re-used when doing the matrix of comparisons.
1040                          */
1041                         diff_free_filespec_blob(one);
1042                         diff_free_filespec_blob(two);
1043                 }
1044         }
1045
1046         strintmap_clear(&sources);
1047         strintmap_clear(&dests);
1048
1049         return renames;
1050 }
1051
1052 #define NUM_CANDIDATE_PER_DST 4
1053 static void record_if_better(struct diff_score m[], struct diff_score *o)
1054 {
1055         int i, worst;
1056
1057         /* find the worst one */
1058         worst = 0;
1059         for (i = 1; i < NUM_CANDIDATE_PER_DST; i++)
1060                 if (score_compare(&m[i], &m[worst]) > 0)
1061                         worst = i;
1062
1063         /* is it better than the worst one? */
1064         if (score_compare(&m[worst], o) > 0)
1065                 m[worst] = *o;
1066 }
1067
1068 /*
1069  * Returns:
1070  * 0 if we are under the limit;
1071  * 1 if we need to disable inexact rename detection;
1072  * 2 if we would be under the limit if we were given -C instead of -C -C.
1073  */
1074 static int too_many_rename_candidates(int num_destinations, int num_sources,
1075                                       struct diff_options *options)
1076 {
1077         int rename_limit = options->rename_limit;
1078         int i, limited_sources;
1079
1080         options->needed_rename_limit = 0;
1081
1082         /*
1083          * This basically does a test for the rename matrix not
1084          * growing larger than a "rename_limit" square matrix, ie:
1085          *
1086          *    num_destinations * num_sources > rename_limit * rename_limit
1087          *
1088          * We use st_mult() to check overflow conditions; in the
1089          * exceptional circumstance that size_t isn't large enough to hold
1090          * the multiplication, the system won't be able to allocate enough
1091          * memory for the matrix anyway.
1092          */
1093         if (rename_limit <= 0)
1094                 rename_limit = 32767;
1095         if (st_mult(num_destinations, num_sources)
1096             <= st_mult(rename_limit, rename_limit))
1097                 return 0;
1098
1099         options->needed_rename_limit =
1100                 num_sources > num_destinations ? num_sources : num_destinations;
1101
1102         /* Are we running under -C -C? */
1103         if (!options->flags.find_copies_harder)
1104                 return 1;
1105
1106         /* Would we bust the limit if we were running under -C? */
1107         for (limited_sources = i = 0; i < num_sources; i++) {
1108                 if (diff_unmodified_pair(rename_src[i].p))
1109                         continue;
1110                 limited_sources++;
1111         }
1112         if (st_mult(num_destinations, limited_sources)
1113             <= st_mult(rename_limit, rename_limit))
1114                 return 2;
1115         return 1;
1116 }
1117
1118 static int find_renames(struct diff_score *mx,
1119                         int dst_cnt,
1120                         int minimum_score,
1121                         int copies,
1122                         struct dir_rename_info *info,
1123                         struct strintmap *dirs_removed)
1124 {
1125         int count = 0, i;
1126
1127         for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) {
1128                 struct diff_rename_dst *dst;
1129
1130                 if ((mx[i].dst < 0) ||
1131                     (mx[i].score < minimum_score))
1132                         break; /* there is no more usable pair. */
1133                 dst = &rename_dst[mx[i].dst];
1134                 if (dst->is_rename)
1135                         continue; /* already done, either exact or fuzzy. */
1136                 if (!copies && rename_src[mx[i].src].p->one->rename_used)
1137                         continue;
1138                 record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
1139                 count++;
1140                 update_dir_rename_counts(info, dirs_removed,
1141                                          rename_src[mx[i].src].p->one->path,
1142                                          rename_dst[mx[i].dst].p->two->path);
1143         }
1144         return count;
1145 }
1146
1147 static void remove_unneeded_paths_from_src(int detecting_copies,
1148                                            struct strintmap *interesting)
1149 {
1150         int i, new_num_src;
1151
1152         if (detecting_copies && !interesting)
1153                 return; /* nothing to remove */
1154         if (break_idx)
1155                 return; /* culling incompatible with break detection */
1156
1157         /*
1158          * Note on reasons why we cull unneeded sources but not destinations:
1159          *   1) Pairings are stored in rename_dst (not rename_src), which we
1160          *      need to keep around.  So, we just can't cull rename_dst even
1161          *      if we wanted to.  But doing so wouldn't help because...
1162          *
1163          *   2) There is a matrix pairwise comparison that follows the
1164          *      "Performing inexact rename detection" progress message.
1165          *      Iterating over the destinations is done in the outer loop,
1166          *      hence we only iterate over each of those once and we can
1167          *      easily skip the outer loop early if the destination isn't
1168          *      relevant.  That's only one check per destination path to
1169          *      skip.
1170          *
1171          *      By contrast, the sources are iterated in the inner loop; if
1172          *      we check whether a source can be skipped, then we'll be
1173          *      checking it N separate times, once for each destination.
1174          *      We don't want to have to iterate over known-not-needed
1175          *      sources N times each, so avoid that by removing the sources
1176          *      from rename_src here.
1177          */
1178         for (i = 0, new_num_src = 0; i < rename_src_nr; i++) {
1179                 struct diff_filespec *one = rename_src[i].p->one;
1180
1181                 /*
1182                  * renames are stored in rename_dst, so if a rename has
1183                  * already been detected using this source, we can just
1184                  * remove the source knowing rename_dst has its info.
1185                  */
1186                 if (!detecting_copies && one->rename_used)
1187                         continue;
1188
1189                 /* If we don't care about the source path, skip it */
1190                 if (interesting && !strintmap_contains(interesting, one->path))
1191                         continue;
1192
1193                 if (new_num_src < i)
1194                         memcpy(&rename_src[new_num_src], &rename_src[i],
1195                                sizeof(struct diff_rename_src));
1196                 new_num_src++;
1197         }
1198
1199         rename_src_nr = new_num_src;
1200 }
1201
1202 static void handle_early_known_dir_renames(struct dir_rename_info *info,
1203                                            struct strintmap *relevant_sources,
1204                                            struct strintmap *dirs_removed)
1205 {
1206         /*
1207          * Directory renames are determined via an aggregate of all renames
1208          * under them and using a "majority wins" rule.  The fact that
1209          * "majority wins", though, means we don't need all the renames
1210          * under the given directory, we only need enough to ensure we have
1211          * a majority.
1212          */
1213
1214         int i, new_num_src;
1215         struct hashmap_iter iter;
1216         struct strmap_entry *entry;
1217
1218         if (!dirs_removed || !relevant_sources)
1219                 return; /* nothing to cull */
1220         if (break_idx)
1221                 return; /* culling incompatbile with break detection */
1222
1223         /*
1224          * Supplement dir_rename_count with number of potential renames,
1225          * marking all potential rename sources as mapping to UNKNOWN_DIR.
1226          */
1227         for (i = 0; i < rename_src_nr; i++) {
1228                 char *old_dir;
1229                 struct diff_filespec *one = rename_src[i].p->one;
1230
1231                 /*
1232                  * sources that are part of a rename will have already been
1233                  * removed by a prior call to remove_unneeded_paths_from_src()
1234                  */
1235                 assert(!one->rename_used);
1236
1237                 old_dir = get_dirname(one->path);
1238                 while (*old_dir != '\0' &&
1239                        NOT_RELEVANT != strintmap_get(dirs_removed, old_dir)) {
1240                         char *freeme = old_dir;
1241
1242                         increment_count(info, old_dir, UNKNOWN_DIR);
1243                         old_dir = get_dirname(old_dir);
1244
1245                         /* Free resources we don't need anymore */
1246                         free(freeme);
1247                 }
1248                 /*
1249                  * old_dir and new_dir free'd in increment_count, but
1250                  * get_dirname() gives us a new pointer we need to free for
1251                  * old_dir.  Also, if the loop runs 0 times we need old_dir
1252                  * to be freed.
1253                  */
1254                 free(old_dir);
1255         }
1256
1257         /*
1258          * For any directory which we need a potential rename detected for
1259          * (i.e. those marked as RELEVANT_FOR_SELF in dirs_removed), check
1260          * whether we have enough renames to satisfy the "majority rules"
1261          * requirement such that detecting any more renames of files under
1262          * it won't change the result.  For any such directory, mark that
1263          * we no longer need to detect a rename for it.  However, since we
1264          * might need to still detect renames for an ancestor of that
1265          * directory, use RELEVANT_FOR_ANCESTOR.
1266          */
1267         strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
1268                 /* entry->key is source_dir */
1269                 struct strintmap *counts = entry->value;
1270
1271                 if (strintmap_get(dirs_removed, entry->key) ==
1272                     RELEVANT_FOR_SELF &&
1273                     dir_rename_already_determinable(counts)) {
1274                         strintmap_set(dirs_removed, entry->key,
1275                                       RELEVANT_FOR_ANCESTOR);
1276                 }
1277         }
1278
1279         for (i = 0, new_num_src = 0; i < rename_src_nr; i++) {
1280                 struct diff_filespec *one = rename_src[i].p->one;
1281                 int val;
1282
1283                 val = strintmap_get(relevant_sources, one->path);
1284
1285                 /*
1286                  * sources that were not found in relevant_sources should
1287                  * have already been removed by a prior call to
1288                  * remove_unneeded_paths_from_src()
1289                  */
1290                 assert(val != -1);
1291
1292                 if (val == RELEVANT_LOCATION) {
1293                         int removable = 1;
1294                         char *dir = get_dirname(one->path);
1295                         while (1) {
1296                                 char *freeme = dir;
1297                                 int res = strintmap_get(dirs_removed, dir);
1298
1299                                 /* Quit if not found or irrelevant */
1300                                 if (res == NOT_RELEVANT)
1301                                         break;
1302                                 /* If RELEVANT_FOR_SELF, can't remove */
1303                                 if (res == RELEVANT_FOR_SELF) {
1304                                         removable = 0;
1305                                         break;
1306                                 }
1307                                 /* Else continue searching upwards */
1308                                 assert(res == RELEVANT_FOR_ANCESTOR);
1309                                 dir = get_dirname(dir);
1310                                 free(freeme);
1311                         }
1312                         free(dir);
1313                         if (removable) {
1314                                 strintmap_set(relevant_sources, one->path,
1315                                               RELEVANT_NO_MORE);
1316                                 continue;
1317                         }
1318                 }
1319
1320                 if (new_num_src < i)
1321                         memcpy(&rename_src[new_num_src], &rename_src[i],
1322                                sizeof(struct diff_rename_src));
1323                 new_num_src++;
1324         }
1325
1326         rename_src_nr = new_num_src;
1327 }
1328
1329 void diffcore_rename_extended(struct diff_options *options,
1330                               struct strintmap *relevant_sources,
1331                               struct strintmap *dirs_removed,
1332                               struct strmap *dir_rename_count,
1333                               struct strmap *cached_pairs)
1334 {
1335         int detect_rename = options->detect_rename;
1336         int minimum_score = options->rename_score;
1337         struct diff_queue_struct *q = &diff_queued_diff;
1338         struct diff_queue_struct outq;
1339         struct diff_score *mx;
1340         int i, j, rename_count, skip_unmodified = 0;
1341         int num_destinations, dst_cnt;
1342         int num_sources, want_copies;
1343         struct progress *progress = NULL;
1344         struct dir_rename_info info;
1345         struct diff_populate_filespec_options dpf_options = {
1346                 .check_binary = 0,
1347                 .missing_object_cb = NULL,
1348                 .missing_object_data = NULL
1349         };
1350         struct inexact_prefetch_options prefetch_options = {
1351                 .repo = options->repo
1352         };
1353
1354         trace2_region_enter("diff", "setup", options->repo);
1355         info.setup = 0;
1356         assert(!dir_rename_count || strmap_empty(dir_rename_count));
1357         want_copies = (detect_rename == DIFF_DETECT_COPY);
1358         if (dirs_removed && (break_idx || want_copies))
1359                 BUG("dirs_removed incompatible with break/copy detection");
1360         if (break_idx && relevant_sources)
1361                 BUG("break detection incompatible with source specification");
1362         if (!minimum_score)
1363                 minimum_score = DEFAULT_RENAME_SCORE;
1364
1365         for (i = 0; i < q->nr; i++) {
1366                 struct diff_filepair *p = q->queue[i];
1367                 if (!DIFF_FILE_VALID(p->one)) {
1368                         if (!DIFF_FILE_VALID(p->two))
1369                                 continue; /* unmerged */
1370                         else if (options->single_follow &&
1371                                  strcmp(options->single_follow, p->two->path))
1372                                 continue; /* not interested */
1373                         else if (!options->flags.rename_empty &&
1374                                  is_empty_blob_oid(&p->two->oid))
1375                                 continue;
1376                         else if (add_rename_dst(p) < 0) {
1377                                 warning("skipping rename detection, detected"
1378                                         " duplicate destination '%s'",
1379                                         p->two->path);
1380                                 goto cleanup;
1381                         }
1382                 }
1383                 else if (!options->flags.rename_empty &&
1384                          is_empty_blob_oid(&p->one->oid))
1385                         continue;
1386                 else if (!DIFF_PAIR_UNMERGED(p) && !DIFF_FILE_VALID(p->two)) {
1387                         /*
1388                          * If the source is a broken "delete", and
1389                          * they did not really want to get broken,
1390                          * that means the source actually stays.
1391                          * So we increment the "rename_used" score
1392                          * by one, to indicate ourselves as a user
1393                          */
1394                         if (p->broken_pair && !p->score)
1395                                 p->one->rename_used++;
1396                         register_rename_src(p);
1397                 }
1398                 else if (want_copies) {
1399                         /*
1400                          * Increment the "rename_used" score by
1401                          * one, to indicate ourselves as a user.
1402                          */
1403                         p->one->rename_used++;
1404                         register_rename_src(p);
1405                 }
1406         }
1407         trace2_region_leave("diff", "setup", options->repo);
1408         if (rename_dst_nr == 0 || rename_src_nr == 0)
1409                 goto cleanup; /* nothing to do */
1410
1411         trace2_region_enter("diff", "exact renames", options->repo);
1412         /*
1413          * We really want to cull the candidates list early
1414          * with cheap tests in order to avoid doing deltas.
1415          */
1416         rename_count = find_exact_renames(options);
1417         trace2_region_leave("diff", "exact renames", options->repo);
1418
1419         /* Did we only want exact renames? */
1420         if (minimum_score == MAX_SCORE)
1421                 goto cleanup;
1422
1423         num_sources = rename_src_nr;
1424
1425         if (want_copies || break_idx) {
1426                 /*
1427                  * Cull sources:
1428                  *   - remove ones corresponding to exact renames
1429                  *   - remove ones not found in relevant_sources
1430                  */
1431                 trace2_region_enter("diff", "cull after exact", options->repo);
1432                 remove_unneeded_paths_from_src(want_copies, relevant_sources);
1433                 trace2_region_leave("diff", "cull after exact", options->repo);
1434         } else {
1435                 /* Determine minimum score to match basenames */
1436                 double factor = 0.5;
1437                 char *basename_factor = getenv("GIT_BASENAME_FACTOR");
1438                 int min_basename_score;
1439
1440                 if (basename_factor)
1441                         factor = strtol(basename_factor, NULL, 10)/100.0;
1442                 assert(factor >= 0.0 && factor <= 1.0);
1443                 min_basename_score = minimum_score +
1444                         (int)(factor * (MAX_SCORE - minimum_score));
1445
1446                 /*
1447                  * Cull sources:
1448                  *   - remove ones involved in renames (found via exact match)
1449                  */
1450                 trace2_region_enter("diff", "cull after exact", options->repo);
1451                 remove_unneeded_paths_from_src(want_copies, NULL);
1452                 trace2_region_leave("diff", "cull after exact", options->repo);
1453
1454                 /* Preparation for basename-driven matching. */
1455                 trace2_region_enter("diff", "dir rename setup", options->repo);
1456                 initialize_dir_rename_info(&info, relevant_sources,
1457                                            dirs_removed, dir_rename_count,
1458                                            cached_pairs);
1459                 trace2_region_leave("diff", "dir rename setup", options->repo);
1460
1461                 /* Utilize file basenames to quickly find renames. */
1462                 trace2_region_enter("diff", "basename matches", options->repo);
1463                 rename_count += find_basename_matches(options,
1464                                                       min_basename_score,
1465                                                       &info,
1466                                                       relevant_sources,
1467                                                       dirs_removed);
1468                 trace2_region_leave("diff", "basename matches", options->repo);
1469
1470                 /*
1471                  * Cull sources, again:
1472                  *   - remove ones involved in renames (found via basenames)
1473                  *   - remove ones not found in relevant_sources
1474                  * and
1475                  *   - remove ones in relevant_sources which are needed only
1476                  *     for directory renames IF no ancestory directory
1477                  *     actually needs to know any more individual path
1478                  *     renames under them
1479                  */
1480                 trace2_region_enter("diff", "cull basename", options->repo);
1481                 remove_unneeded_paths_from_src(want_copies, relevant_sources);
1482                 handle_early_known_dir_renames(&info, relevant_sources,
1483                                                dirs_removed);
1484                 trace2_region_leave("diff", "cull basename", options->repo);
1485         }
1486
1487         /* Calculate how many rename destinations are left */
1488         num_destinations = (rename_dst_nr - rename_count);
1489         num_sources = rename_src_nr; /* rename_src_nr reflects lower number */
1490
1491         /* All done? */
1492         if (!num_destinations || !num_sources)
1493                 goto cleanup;
1494
1495         switch (too_many_rename_candidates(num_destinations, num_sources,
1496                                            options)) {
1497         case 1:
1498                 goto cleanup;
1499         case 2:
1500                 options->degraded_cc_to_c = 1;
1501                 skip_unmodified = 1;
1502                 break;
1503         default:
1504                 break;
1505         }
1506
1507         trace2_region_enter("diff", "inexact renames", options->repo);
1508         if (options->show_rename_progress) {
1509                 progress = start_delayed_progress(
1510                                 _("Performing inexact rename detection"),
1511                                 (uint64_t)num_destinations * (uint64_t)num_sources);
1512         }
1513
1514         /* Finish setting up dpf_options */
1515         prefetch_options.skip_unmodified = skip_unmodified;
1516         if (options->repo == the_repository && has_promisor_remote()) {
1517                 dpf_options.missing_object_cb = inexact_prefetch;
1518                 dpf_options.missing_object_data = &prefetch_options;
1519         }
1520
1521         CALLOC_ARRAY(mx, st_mult(NUM_CANDIDATE_PER_DST, num_destinations));
1522         for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
1523                 struct diff_filespec *two = rename_dst[i].p->two;
1524                 struct diff_score *m;
1525
1526                 if (rename_dst[i].is_rename)
1527                         continue; /* exact or basename match already handled */
1528
1529                 m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST];
1530                 for (j = 0; j < NUM_CANDIDATE_PER_DST; j++)
1531                         m[j].dst = -1;
1532
1533                 for (j = 0; j < rename_src_nr; j++) {
1534                         struct diff_filespec *one = rename_src[j].p->one;
1535                         struct diff_score this_src;
1536
1537                         assert(!one->rename_used || want_copies || break_idx);
1538
1539                         if (skip_unmodified &&
1540                             diff_unmodified_pair(rename_src[j].p))
1541                                 continue;
1542
1543                         this_src.score = estimate_similarity(options->repo,
1544                                                              one, two,
1545                                                              minimum_score,
1546                                                              &dpf_options);
1547                         this_src.name_score = basename_same(one, two);
1548                         this_src.dst = i;
1549                         this_src.src = j;
1550                         record_if_better(m, &this_src);
1551                         /*
1552                          * Once we run estimate_similarity,
1553                          * We do not need the text anymore.
1554                          */
1555                         diff_free_filespec_blob(one);
1556                         diff_free_filespec_blob(two);
1557                 }
1558                 dst_cnt++;
1559                 display_progress(progress,
1560                                  (uint64_t)dst_cnt * (uint64_t)num_sources);
1561         }
1562         stop_progress(&progress);
1563
1564         /* cost matrix sorted by most to least similar pair */
1565         STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare);
1566
1567         rename_count += find_renames(mx, dst_cnt, minimum_score, 0,
1568                                      &info, dirs_removed);
1569         if (want_copies)
1570                 rename_count += find_renames(mx, dst_cnt, minimum_score, 1,
1571                                              &info, dirs_removed);
1572         free(mx);
1573         trace2_region_leave("diff", "inexact renames", options->repo);
1574
1575  cleanup:
1576         /* At this point, we have found some renames and copies and they
1577          * are recorded in rename_dst.  The original list is still in *q.
1578          */
1579         trace2_region_enter("diff", "write back to queue", options->repo);
1580         DIFF_QUEUE_CLEAR(&outq);
1581         for (i = 0; i < q->nr; i++) {
1582                 struct diff_filepair *p = q->queue[i];
1583                 struct diff_filepair *pair_to_free = NULL;
1584
1585                 if (DIFF_PAIR_UNMERGED(p)) {
1586                         diff_q(&outq, p);
1587                 }
1588                 else if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
1589                         /* Creation */
1590                         diff_q(&outq, p);
1591                 }
1592                 else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {
1593                         /*
1594                          * Deletion
1595                          *
1596                          * We would output this delete record if:
1597                          *
1598                          * (1) this is a broken delete and the counterpart
1599                          *     broken create remains in the output; or
1600                          * (2) this is not a broken delete, and rename_dst
1601                          *     does not have a rename/copy to move p->one->path
1602                          *     out of existence.
1603                          *
1604                          * Otherwise, the counterpart broken create
1605                          * has been turned into a rename-edit; or
1606                          * delete did not have a matching create to
1607                          * begin with.
1608                          */
1609                         if (DIFF_PAIR_BROKEN(p)) {
1610                                 /* broken delete */
1611                                 struct diff_rename_dst *dst = locate_rename_dst(p);
1612                                 if (!dst)
1613                                         BUG("tracking failed somehow; failed to find associated dst for broken pair");
1614                                 if (dst->is_rename)
1615                                         /* counterpart is now rename/copy */
1616                                         pair_to_free = p;
1617                         }
1618                         else {
1619                                 if (p->one->rename_used)
1620                                         /* this path remains */
1621                                         pair_to_free = p;
1622                         }
1623
1624                         if (!pair_to_free)
1625                                 diff_q(&outq, p);
1626                 }
1627                 else if (!diff_unmodified_pair(p))
1628                         /* all the usual ones need to be kept */
1629                         diff_q(&outq, p);
1630                 else
1631                         /* no need to keep unmodified pairs */
1632                         pair_to_free = p;
1633
1634                 if (pair_to_free)
1635                         diff_free_filepair(pair_to_free);
1636         }
1637         diff_debug_queue("done copying original", &outq);
1638
1639         free(q->queue);
1640         *q = outq;
1641         diff_debug_queue("done collapsing", q);
1642
1643         for (i = 0; i < rename_dst_nr; i++)
1644                 if (rename_dst[i].filespec_to_free)
1645                         free_filespec(rename_dst[i].filespec_to_free);
1646
1647         cleanup_dir_rename_info(&info, dirs_removed, dir_rename_count != NULL);
1648         FREE_AND_NULL(rename_dst);
1649         rename_dst_nr = rename_dst_alloc = 0;
1650         FREE_AND_NULL(rename_src);
1651         rename_src_nr = rename_src_alloc = 0;
1652         if (break_idx) {
1653                 strintmap_clear(break_idx);
1654                 FREE_AND_NULL(break_idx);
1655         }
1656         trace2_region_leave("diff", "write back to queue", options->repo);
1657         return;
1658 }
1659
1660 void diffcore_rename(struct diff_options *options)
1661 {
1662         diffcore_rename_extended(options, NULL, NULL, NULL, NULL);
1663 }