pack-redundant: new algorithm to find min packs
[git] / builtin / pack-redundant.c
1 /*
2 *
3 * Copyright 2005, Lukas Sandstrom <lukass@etek.chalmers.se>
4 *
5 * This file is licensed under the GPL v2.
6 *
7 */
8
9 #include "builtin.h"
10 #include "repository.h"
11 #include "packfile.h"
12 #include "object-store.h"
13
14 #define BLKSIZE 512
15
16 static const char pack_redundant_usage[] =
17 "git pack-redundant [--verbose] [--alt-odb] (--all | <filename.pack>...)";
18
19 static int load_all_packs, verbose, alt_odb;
20
21 struct llist_item {
22         struct llist_item *next;
23         const struct object_id *oid;
24 };
25 static struct llist {
26         struct llist_item *front;
27         struct llist_item *back;
28         size_t size;
29 } *all_objects; /* all objects which must be present in local packfiles */
30
31 static struct pack_list {
32         struct pack_list *next;
33         struct packed_git *pack;
34         struct llist *unique_objects;
35         struct llist *all_objects;
36 } *local_packs = NULL, *altodb_packs = NULL;
37
38 static struct llist_item *free_nodes;
39
40 static inline void llist_item_put(struct llist_item *item)
41 {
42         item->next = free_nodes;
43         free_nodes = item;
44 }
45
46 static inline struct llist_item *llist_item_get(void)
47 {
48         struct llist_item *new_item;
49         if ( free_nodes ) {
50                 new_item = free_nodes;
51                 free_nodes = free_nodes->next;
52         } else {
53                 int i = 1;
54                 ALLOC_ARRAY(new_item, BLKSIZE);
55                 for (; i < BLKSIZE; i++)
56                         llist_item_put(&new_item[i]);
57         }
58         return new_item;
59 }
60
61 static inline void llist_init(struct llist **list)
62 {
63         *list = xmalloc(sizeof(struct llist));
64         (*list)->front = (*list)->back = NULL;
65         (*list)->size = 0;
66 }
67
68 static struct llist * llist_copy(struct llist *list)
69 {
70         struct llist *ret;
71         struct llist_item *new_item, *old_item, *prev;
72
73         llist_init(&ret);
74
75         if ((ret->size = list->size) == 0)
76                 return ret;
77
78         new_item = ret->front = llist_item_get();
79         new_item->oid = list->front->oid;
80
81         old_item = list->front->next;
82         while (old_item) {
83                 prev = new_item;
84                 new_item = llist_item_get();
85                 prev->next = new_item;
86                 new_item->oid = old_item->oid;
87                 old_item = old_item->next;
88         }
89         new_item->next = NULL;
90         ret->back = new_item;
91
92         return ret;
93 }
94
95 static inline struct llist_item *llist_insert(struct llist *list,
96                                               struct llist_item *after,
97                                               const struct object_id *oid)
98 {
99         struct llist_item *new_item = llist_item_get();
100         new_item->oid = oid;
101         new_item->next = NULL;
102
103         if (after != NULL) {
104                 new_item->next = after->next;
105                 after->next = new_item;
106                 if (after == list->back)
107                         list->back = new_item;
108         } else {/* insert in front */
109                 if (list->size == 0)
110                         list->back = new_item;
111                 else
112                         new_item->next = list->front;
113                 list->front = new_item;
114         }
115         list->size++;
116         return new_item;
117 }
118
119 static inline struct llist_item *llist_insert_back(struct llist *list,
120                                                    const struct object_id *oid)
121 {
122         return llist_insert(list, list->back, oid);
123 }
124
125 static inline struct llist_item *llist_insert_sorted_unique(struct llist *list,
126                         const struct object_id *oid, struct llist_item *hint)
127 {
128         struct llist_item *prev = NULL, *l;
129
130         l = (hint == NULL) ? list->front : hint;
131         while (l) {
132                 int cmp = oidcmp(l->oid, oid);
133                 if (cmp > 0) { /* we insert before this entry */
134                         return llist_insert(list, prev, oid);
135                 }
136                 if (!cmp) { /* already exists */
137                         return l;
138                 }
139                 prev = l;
140                 l = l->next;
141         }
142         /* insert at the end */
143         return llist_insert_back(list, oid);
144 }
145
146 /* returns a pointer to an item in front of sha1 */
147 static inline struct llist_item * llist_sorted_remove(struct llist *list, const struct object_id *oid, struct llist_item *hint)
148 {
149         struct llist_item *prev, *l;
150
151 redo_from_start:
152         l = (hint == NULL) ? list->front : hint;
153         prev = NULL;
154         while (l) {
155                 int cmp = oidcmp(l->oid, oid);
156                 if (cmp > 0) /* not in list, since sorted */
157                         return prev;
158                 if (!cmp) { /* found */
159                         if (prev == NULL) {
160                                 if (hint != NULL && hint != list->front) {
161                                         /* we don't know the previous element */
162                                         hint = NULL;
163                                         goto redo_from_start;
164                                 }
165                                 list->front = l->next;
166                         } else
167                                 prev->next = l->next;
168                         if (l == list->back)
169                                 list->back = prev;
170                         llist_item_put(l);
171                         list->size--;
172                         return prev;
173                 }
174                 prev = l;
175                 l = l->next;
176         }
177         return prev;
178 }
179
180 /* computes A\B */
181 static void llist_sorted_difference_inplace(struct llist *A,
182                                      struct llist *B)
183 {
184         struct llist_item *hint, *b;
185
186         hint = NULL;
187         b = B->front;
188
189         while (b) {
190                 hint = llist_sorted_remove(A, b->oid, hint);
191                 b = b->next;
192         }
193 }
194
195 static inline struct pack_list * pack_list_insert(struct pack_list **pl,
196                                            struct pack_list *entry)
197 {
198         struct pack_list *p = xmalloc(sizeof(struct pack_list));
199         memcpy(p, entry, sizeof(struct pack_list));
200         p->next = *pl;
201         *pl = p;
202         return p;
203 }
204
205 static inline size_t pack_list_size(struct pack_list *pl)
206 {
207         size_t ret = 0;
208         while (pl) {
209                 ret++;
210                 pl = pl->next;
211         }
212         return ret;
213 }
214
215 static struct pack_list * pack_list_difference(const struct pack_list *A,
216                                                const struct pack_list *B)
217 {
218         struct pack_list *ret;
219         const struct pack_list *pl;
220
221         if (A == NULL)
222                 return NULL;
223
224         pl = B;
225         while (pl != NULL) {
226                 if (A->pack == pl->pack)
227                         return pack_list_difference(A->next, B);
228                 pl = pl->next;
229         }
230         ret = xmalloc(sizeof(struct pack_list));
231         memcpy(ret, A, sizeof(struct pack_list));
232         ret->next = pack_list_difference(A->next, B);
233         return ret;
234 }
235
236 static void cmp_two_packs(struct pack_list *p1, struct pack_list *p2)
237 {
238         unsigned long p1_off = 0, p2_off = 0, p1_step, p2_step;
239         const unsigned char *p1_base, *p2_base;
240         struct llist_item *p1_hint = NULL, *p2_hint = NULL;
241         const unsigned int hashsz = the_hash_algo->rawsz;
242
243         if (!p1->unique_objects)
244                 p1->unique_objects = llist_copy(p1->all_objects);
245         if (!p2->unique_objects)
246                 p2->unique_objects = llist_copy(p2->all_objects);
247
248         p1_base = p1->pack->index_data;
249         p2_base = p2->pack->index_data;
250         p1_base += 256 * 4 + ((p1->pack->index_version < 2) ? 4 : 8);
251         p2_base += 256 * 4 + ((p2->pack->index_version < 2) ? 4 : 8);
252         p1_step = hashsz + ((p1->pack->index_version < 2) ? 4 : 0);
253         p2_step = hashsz + ((p2->pack->index_version < 2) ? 4 : 0);
254
255         while (p1_off < p1->pack->num_objects * p1_step &&
256                p2_off < p2->pack->num_objects * p2_step)
257         {
258                 int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
259                 /* cmp ~ p1 - p2 */
260                 if (cmp == 0) {
261                         p1_hint = llist_sorted_remove(p1->unique_objects,
262                                         (const struct object_id *)(p1_base + p1_off),
263                                         p1_hint);
264                         p2_hint = llist_sorted_remove(p2->unique_objects,
265                                         (const struct object_id *)(p1_base + p1_off),
266                                         p2_hint);
267                         p1_off += p1_step;
268                         p2_off += p2_step;
269                         continue;
270                 }
271                 if (cmp < 0) { /* p1 has the object, p2 doesn't */
272                         p1_off += p1_step;
273                 } else { /* p2 has the object, p1 doesn't */
274                         p2_off += p2_step;
275                 }
276         }
277 }
278
279 static size_t sizeof_union(struct packed_git *p1, struct packed_git *p2)
280 {
281         size_t ret = 0;
282         unsigned long p1_off = 0, p2_off = 0, p1_step, p2_step;
283         const unsigned char *p1_base, *p2_base;
284         const unsigned int hashsz = the_hash_algo->rawsz;
285
286         p1_base = p1->index_data;
287         p2_base = p2->index_data;
288         p1_base += 256 * 4 + ((p1->index_version < 2) ? 4 : 8);
289         p2_base += 256 * 4 + ((p2->index_version < 2) ? 4 : 8);
290         p1_step = hashsz + ((p1->index_version < 2) ? 4 : 0);
291         p2_step = hashsz + ((p2->index_version < 2) ? 4 : 0);
292
293         while (p1_off < p1->num_objects * p1_step &&
294                p2_off < p2->num_objects * p2_step)
295         {
296                 int cmp = hashcmp(p1_base + p1_off, p2_base + p2_off);
297                 /* cmp ~ p1 - p2 */
298                 if (cmp == 0) {
299                         ret++;
300                         p1_off += p1_step;
301                         p2_off += p2_step;
302                         continue;
303                 }
304                 if (cmp < 0) { /* p1 has the object, p2 doesn't */
305                         p1_off += p1_step;
306                 } else { /* p2 has the object, p1 doesn't */
307                         p2_off += p2_step;
308                 }
309         }
310         return ret;
311 }
312
313 /* another O(n^2) function ... */
314 static size_t get_pack_redundancy(struct pack_list *pl)
315 {
316         struct pack_list *subset;
317         size_t ret = 0;
318
319         if (pl == NULL)
320                 return 0;
321
322         while ((subset = pl->next)) {
323                 while (subset) {
324                         ret += sizeof_union(pl->pack, subset->pack);
325                         subset = subset->next;
326                 }
327                 pl = pl->next;
328         }
329         return ret;
330 }
331
332 static inline off_t pack_set_bytecount(struct pack_list *pl)
333 {
334         off_t ret = 0;
335         while (pl) {
336                 ret += pl->pack->pack_size;
337                 ret += pl->pack->index_size;
338                 pl = pl->next;
339         }
340         return ret;
341 }
342
343 static int cmp_pack_list_reverse(const void *a, const void *b)
344 {
345         struct pack_list *pl_a = *((struct pack_list **)a);
346         struct pack_list *pl_b = *((struct pack_list **)b);
347         size_t sz_a = pl_a->all_objects->size;
348         size_t sz_b = pl_b->all_objects->size;
349
350         if (sz_a == sz_b)
351                 return 0;
352         else if (sz_a < sz_b)
353                 return 1;
354         else
355                 return -1;
356 }
357
358 /* Sort pack_list, greater size of all_objects first */
359 static void sort_pack_list(struct pack_list **pl)
360 {
361         struct pack_list **ary, *p;
362         int i;
363         size_t n = pack_list_size(*pl);
364
365         if (n < 2)
366                 return;
367
368         /* prepare an array of packed_list for easier sorting */
369         ary = xcalloc(n, sizeof(struct pack_list *));
370         for (n = 0, p = *pl; p; p = p->next)
371                 ary[n++] = p;
372
373         QSORT(ary, n, cmp_pack_list_reverse);
374
375         /* link them back again */
376         for (i = 0; i < n - 1; i++)
377                 ary[i]->next = ary[i + 1];
378         ary[n - 1]->next = NULL;
379         *pl = ary[0];
380
381         free(ary);
382 }
383
384
385 static void minimize(struct pack_list **min)
386 {
387         struct pack_list *pl, *unique = NULL, *non_unique = NULL;
388         struct llist *missing, *unique_pack_objects;
389
390         pl = local_packs;
391         while (pl) {
392                 if (pl->unique_objects->size)
393                         pack_list_insert(&unique, pl);
394                 else
395                         pack_list_insert(&non_unique, pl);
396                 pl = pl->next;
397         }
398         /* find out which objects are missing from the set of unique packs */
399         missing = llist_copy(all_objects);
400         pl = unique;
401         while (pl) {
402                 llist_sorted_difference_inplace(missing, pl->all_objects);
403                 pl = pl->next;
404         }
405
406         *min = unique;
407
408         /* return if there are no objects missing from the unique set */
409         if (missing->size == 0) {
410                 free(missing);
411                 return;
412         }
413
414         unique_pack_objects = llist_copy(all_objects);
415         llist_sorted_difference_inplace(unique_pack_objects, missing);
416
417         /* remove unique pack objects from the non_unique packs */
418         pl = non_unique;
419         while (pl) {
420                 llist_sorted_difference_inplace(pl->all_objects, unique_pack_objects);
421                 pl = pl->next;
422         }
423
424         while (non_unique) {
425                 /* sort the non_unique packs, greater size of all_objects first */
426                 sort_pack_list(&non_unique);
427                 if (non_unique->all_objects->size == 0)
428                         break;
429
430                 pack_list_insert(min, non_unique);
431
432                 for (pl = non_unique->next; pl && pl->all_objects->size > 0;  pl = pl->next)
433                         llist_sorted_difference_inplace(pl->all_objects, non_unique->all_objects);
434
435                 non_unique = non_unique->next;
436         }
437 }
438
439 static void load_all_objects(void)
440 {
441         struct pack_list *pl = local_packs;
442         struct llist_item *hint, *l;
443
444         llist_init(&all_objects);
445
446         while (pl) {
447                 hint = NULL;
448                 l = pl->all_objects->front;
449                 while (l) {
450                         hint = llist_insert_sorted_unique(all_objects,
451                                                           l->oid, hint);
452                         l = l->next;
453                 }
454                 pl = pl->next;
455         }
456         /* remove objects present in remote packs */
457         pl = altodb_packs;
458         while (pl) {
459                 llist_sorted_difference_inplace(all_objects, pl->all_objects);
460                 pl = pl->next;
461         }
462 }
463
464 /* this scales like O(n^2) */
465 static void cmp_local_packs(void)
466 {
467         struct pack_list *subset, *pl = local_packs;
468
469         while ((subset = pl)) {
470                 while ((subset = subset->next))
471                         cmp_two_packs(pl, subset);
472                 pl = pl->next;
473         }
474 }
475
476 static void scan_alt_odb_packs(void)
477 {
478         struct pack_list *local, *alt;
479
480         alt = altodb_packs;
481         while (alt) {
482                 local = local_packs;
483                 while (local) {
484                         llist_sorted_difference_inplace(local->all_objects,
485                                                         alt->all_objects);
486                         local = local->next;
487                 }
488                 alt = alt->next;
489         }
490 }
491
492 static struct pack_list * add_pack(struct packed_git *p)
493 {
494         struct pack_list l;
495         unsigned long off = 0, step;
496         const unsigned char *base;
497
498         if (!p->pack_local && !(alt_odb || verbose))
499                 return NULL;
500
501         l.pack = p;
502         llist_init(&l.all_objects);
503
504         if (open_pack_index(p))
505                 return NULL;
506
507         base = p->index_data;
508         base += 256 * 4 + ((p->index_version < 2) ? 4 : 8);
509         step = the_hash_algo->rawsz + ((p->index_version < 2) ? 4 : 0);
510         while (off < p->num_objects * step) {
511                 llist_insert_back(l.all_objects, (const struct object_id *)(base + off));
512                 off += step;
513         }
514         l.unique_objects = NULL;
515         if (p->pack_local)
516                 return pack_list_insert(&local_packs, &l);
517         else
518                 return pack_list_insert(&altodb_packs, &l);
519 }
520
521 static struct pack_list * add_pack_file(const char *filename)
522 {
523         struct packed_git *p = get_all_packs(the_repository);
524
525         if (strlen(filename) < 40)
526                 die("Bad pack filename: %s", filename);
527
528         while (p) {
529                 if (strstr(p->pack_name, filename))
530                         return add_pack(p);
531                 p = p->next;
532         }
533         die("Filename %s not found in packed_git", filename);
534 }
535
536 static void load_all(void)
537 {
538         struct packed_git *p = get_all_packs(the_repository);
539
540         while (p) {
541                 add_pack(p);
542                 p = p->next;
543         }
544 }
545
546 int cmd_pack_redundant(int argc, const char **argv, const char *prefix)
547 {
548         int i;
549         struct pack_list *min = NULL, *red, *pl;
550         struct llist *ignore;
551         struct object_id *oid;
552         char buf[GIT_MAX_HEXSZ + 2]; /* hex hash + \n + \0 */
553
554         if (argc == 2 && !strcmp(argv[1], "-h"))
555                 usage(pack_redundant_usage);
556
557         for (i = 1; i < argc; i++) {
558                 const char *arg = argv[i];
559                 if (!strcmp(arg, "--")) {
560                         i++;
561                         break;
562                 }
563                 if (!strcmp(arg, "--all")) {
564                         load_all_packs = 1;
565                         continue;
566                 }
567                 if (!strcmp(arg, "--verbose")) {
568                         verbose = 1;
569                         continue;
570                 }
571                 if (!strcmp(arg, "--alt-odb")) {
572                         alt_odb = 1;
573                         continue;
574                 }
575                 if (*arg == '-')
576                         usage(pack_redundant_usage);
577                 else
578                         break;
579         }
580
581         if (load_all_packs)
582                 load_all();
583         else
584                 while (*(argv + i) != NULL)
585                         add_pack_file(*(argv + i++));
586
587         if (local_packs == NULL)
588                 die("Zero packs found!");
589
590         load_all_objects();
591
592         if (alt_odb)
593                 scan_alt_odb_packs();
594
595         /* ignore objects given on stdin */
596         llist_init(&ignore);
597         if (!isatty(0)) {
598                 while (fgets(buf, sizeof(buf), stdin)) {
599                         oid = xmalloc(sizeof(*oid));
600                         if (get_oid_hex(buf, oid))
601                                 die("Bad object ID on stdin: %s", buf);
602                         llist_insert_sorted_unique(ignore, oid, NULL);
603                 }
604         }
605         llist_sorted_difference_inplace(all_objects, ignore);
606         pl = local_packs;
607         while (pl) {
608                 llist_sorted_difference_inplace(pl->all_objects, ignore);
609                 pl = pl->next;
610         }
611
612         cmp_local_packs();
613
614         minimize(&min);
615
616         if (verbose) {
617                 fprintf(stderr, "There are %lu packs available in alt-odbs.\n",
618                         (unsigned long)pack_list_size(altodb_packs));
619                 fprintf(stderr, "The smallest (bytewise) set of packs is:\n");
620                 pl = min;
621                 while (pl) {
622                         fprintf(stderr, "\t%s\n", pl->pack->pack_name);
623                         pl = pl->next;
624                 }
625                 fprintf(stderr, "containing %lu duplicate objects "
626                                 "with a total size of %lukb.\n",
627                         (unsigned long)get_pack_redundancy(min),
628                         (unsigned long)pack_set_bytecount(min)/1024);
629                 fprintf(stderr, "A total of %lu unique objects were considered.\n",
630                         (unsigned long)all_objects->size);
631                 fprintf(stderr, "Redundant packs (with indexes):\n");
632         }
633         pl = red = pack_list_difference(local_packs, min);
634         while (pl) {
635                 printf("%s\n%s\n",
636                        sha1_pack_index_name(pl->pack->sha1),
637                        pl->pack->pack_name);
638                 pl = pl->next;
639         }
640         if (verbose)
641                 fprintf(stderr, "%luMB of redundant packs in total.\n",
642                         (unsigned long)pack_set_bytecount(red)/(1024*1024));
643
644         return 0;
645 }