Added automatic index generation to fast-import.
[git] / fast-import.c
1 #include "builtin.h"
2 #include "cache.h"
3 #include "object.h"
4 #include "blob.h"
5 #include "delta.h"
6 #include "pack.h"
7 #include "csum-file.h"
8
9 static int max_depth = 10;
10 static unsigned long object_count;
11 static unsigned long duplicate_count;
12 static unsigned long packoff;
13 static unsigned long overflow_count;
14 static int packfd;
15 static int current_depth;
16 static void *lastdat;
17 static unsigned long lastdatlen;
18 static unsigned char lastsha1[20];
19 static unsigned char packsha1[20];
20
21 struct object_entry
22 {
23         struct object_entry *next;
24         unsigned long offset;
25         unsigned char sha1[20];
26 };
27
28 struct overflow_object_entry
29 {
30         struct overflow_object_entry *next;
31         struct object_entry oe;
32 };
33
34 struct object_entry *pool_start;
35 struct object_entry *pool_next;
36 struct object_entry *pool_end;
37 struct overflow_object_entry *overflow;
38 struct object_entry *table[1 << 16];
39
40 static struct object_entry* new_object(unsigned char *sha1)
41 {
42         if (pool_next != pool_end) {
43                 struct object_entry *e = pool_next++;
44                 memcpy(e->sha1, sha1, sizeof(e->sha1));
45                 return e;
46         } else {
47                 struct overflow_object_entry *e;
48
49                 e = xmalloc(sizeof(struct overflow_object_entry));
50                 e->next = overflow;
51                 memcpy(e->oe.sha1, sha1, sizeof(e->oe.sha1));
52                 overflow = e;
53                 overflow_count++;
54                 return &e->oe;
55         }
56 }
57
58 static struct object_entry* insert_object(unsigned char *sha1)
59 {
60         unsigned int h = sha1[0] << 8 | sha1[1];
61         struct object_entry *e = table[h];
62         struct object_entry *p = 0;
63
64         while (e) {
65                 if (!memcmp(sha1, e->sha1, sizeof(e->sha1)))
66                         return e;
67                 p = e;
68                 e = e->next;
69         }
70
71         e = new_object(sha1);
72         e->next = 0;
73         e->offset = 0;
74         if (p)
75                 p->next = e;
76         else
77                 table[h] = e;
78         return e;
79 }
80
81 static ssize_t yread(int fd, void *buffer, size_t length)
82 {
83         ssize_t ret = 0;
84         while (ret < length) {
85                 ssize_t size = xread(fd, (char *) buffer + ret, length - ret);
86                 if (size < 0) {
87                         return size;
88                 }
89                 if (size == 0) {
90                         return ret;
91                 }
92                 ret += size;
93         }
94         return ret;
95 }
96
97 static ssize_t ywrite(int fd, void *buffer, size_t length)
98 {
99         ssize_t ret = 0;
100         while (ret < length) {
101                 ssize_t size = xwrite(fd, (char *) buffer + ret, length - ret);
102                 if (size < 0) {
103                         return size;
104                 }
105                 if (size == 0) {
106                         return ret;
107                 }
108                 ret += size;
109         }
110         return ret;
111 }
112
113 static unsigned long encode_header(enum object_type type, unsigned long size, unsigned char *hdr)
114 {
115         int n = 1;
116         unsigned char c;
117
118         if (type < OBJ_COMMIT || type > OBJ_DELTA)
119                 die("bad type %d", type);
120
121         c = (type << 4) | (size & 15);
122         size >>= 4;
123         while (size) {
124                 *hdr++ = c | 0x80;
125                 c = size & 0x7f;
126                 size >>= 7;
127                 n++;
128         }
129         *hdr = c;
130         return n;
131 }
132
133 static void write_blob(void *dat, unsigned long datlen)
134 {
135         z_stream s;
136         void *out, *delta;
137         unsigned char hdr[64];
138         unsigned long hdrlen, deltalen;
139
140         if (lastdat && current_depth < max_depth) {
141                 delta = diff_delta(lastdat, lastdatlen,
142                         dat, datlen,
143                         &deltalen, 0);
144         } else
145                 delta = 0;
146
147         memset(&s, 0, sizeof(s));
148         deflateInit(&s, zlib_compression_level);
149
150         if (delta) {
151                 current_depth++;
152                 s.next_in = delta;
153                 s.avail_in = deltalen;
154                 hdrlen = encode_header(OBJ_DELTA, deltalen, hdr);
155                 if (ywrite(packfd, hdr, hdrlen) != hdrlen)
156                         die("Can't write object header: %s", strerror(errno));
157                 if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1))
158                         die("Can't write object base: %s", strerror(errno));
159                 packoff += hdrlen + sizeof(lastsha1);
160         } else {
161                 current_depth = 0;
162                 s.next_in = dat;
163                 s.avail_in = datlen;
164                 hdrlen = encode_header(OBJ_BLOB, datlen, hdr);
165                 if (ywrite(packfd, hdr, hdrlen) != hdrlen)
166                         die("Can't write object header: %s", strerror(errno));
167                 packoff += hdrlen;
168         }
169
170         s.avail_out = deflateBound(&s, s.avail_in);
171         s.next_out = out = xmalloc(s.avail_out);
172         while (deflate(&s, Z_FINISH) == Z_OK)
173                 /* nothing */;
174         deflateEnd(&s);
175
176         if (ywrite(packfd, out, s.total_out) != s.total_out)
177                 die("Failed writing compressed data %s", strerror(errno));
178         packoff += s.total_out;
179
180         free(out);
181         if (delta)
182                 free(delta);
183 }
184
185 static void init_pack_header()
186 {
187         const char* magic = "PACK";
188         unsigned long version = 2;
189         unsigned long zero = 0;
190
191         version = htonl(version);
192
193         if (ywrite(packfd, (char*)magic, 4) != 4)
194                 die("Can't write pack magic: %s", strerror(errno));
195         if (ywrite(packfd, &version, 4) != 4)
196                 die("Can't write pack version: %s", strerror(errno));
197         if (ywrite(packfd, &zero, 4) != 4)
198                 die("Can't write 0 object count: %s", strerror(errno));
199         packoff = 4 * 3;
200 }
201
202 static void fixup_header_footer()
203 {
204         SHA_CTX c;
205         char hdr[8];
206         unsigned long cnt;
207         char *buf;
208         size_t n;
209
210         if (lseek(packfd, 0, SEEK_SET) != 0)
211                 die("Failed seeking to start: %s", strerror(errno));
212
213         SHA1_Init(&c);
214         if (yread(packfd, hdr, 8) != 8)
215                 die("Failed reading header: %s", strerror(errno));
216         SHA1_Update(&c, hdr, 8);
217
218         cnt = htonl(object_count);
219         SHA1_Update(&c, &cnt, 4);
220         if (ywrite(packfd, &cnt, 4) != 4)
221                 die("Failed writing object count: %s", strerror(errno));
222
223         buf = xmalloc(128 * 1024);
224         for (;;) {
225                 n = xread(packfd, buf, 128 * 1024);
226                 if (n <= 0)
227                         break;
228                 SHA1_Update(&c, buf, n);
229         }
230         free(buf);
231
232         SHA1_Final(packsha1, &c);
233         if (ywrite(packfd, packsha1, sizeof(packsha1)) != sizeof(packsha1))
234                 die("Failed writing pack checksum: %s", strerror(errno));
235 }
236
237 static int oecmp (const void *_a, const void *_b)
238 {
239         struct object_entry *a = *((struct object_entry**)_a);
240         struct object_entry *b = *((struct object_entry**)_b);
241         return memcmp(a->sha1, b->sha1, sizeof(a->sha1));
242 }
243
244 static void write_index(const char *idx_name)
245 {
246         struct sha1file *f;
247         struct object_entry **idx, **c, **last;
248         struct object_entry *e;
249         struct overflow_object_entry *o;
250         unsigned int array[256];
251         int i;
252
253         /* Build the sorted table of object IDs. */
254         idx = xmalloc(object_count * sizeof(struct object_entry*));
255         c = idx;
256         for (e = pool_start; e != pool_next; e++)
257                 *c++ = e;
258         for (o = overflow; o; o = o->next)
259                 *c++ = &o->oe;
260         last = idx + object_count;
261         qsort(idx, object_count, sizeof(struct object_entry*), oecmp);
262
263         /* Generate the fan-out array. */
264         c = idx;
265         for (i = 0; i < 256; i++) {
266                 struct object_entry **next = c;;
267                 while (next < last) {
268                         if ((*next)->sha1[0] != i)
269                                 break;
270                         next++;
271                 }
272                 array[i] = htonl(next - idx);
273                 c = next;
274         }
275
276         f = sha1create("%s", idx_name);
277         sha1write(f, array, 256 * sizeof(int));
278         for (c = idx; c != last; c++) {
279                 unsigned int offset = htonl((*c)->offset);
280                 sha1write(f, &offset, 4);
281                 sha1write(f, (*c)->sha1, sizeof((*c)->sha1));
282         }
283         sha1write(f, packsha1, sizeof(packsha1));
284         sha1close(f, NULL, 1);
285         free(idx);
286 }
287
288 int main(int argc, const char **argv)
289 {
290         const char *base_name = argv[1];
291         int est_obj_cnt = atoi(argv[2]);
292         char *pack_name;
293         char *idx_name;
294
295         pack_name = xmalloc(strlen(base_name) + 6);
296         sprintf(pack_name, "%s.pack", base_name);
297         idx_name = xmalloc(strlen(base_name) + 5);
298         sprintf(idx_name, "%s.idx", base_name);
299
300         packfd = open(pack_name, O_RDWR|O_CREAT|O_TRUNC, 0666);
301         if (packfd < 0)
302                 die("Can't create pack file %s: %s", pack_name, strerror(errno));
303
304         pool_start = xmalloc(est_obj_cnt * sizeof(struct object_entry));
305         pool_next = pool_start;
306         pool_end = pool_start + est_obj_cnt;
307
308         init_pack_header();
309         for (;;) {
310                 unsigned long datlen;
311                 int hdrlen;
312                 void *dat;
313                 char hdr[128];
314                 unsigned char sha1[20];
315                 SHA_CTX c;
316                 struct object_entry *e;
317
318                 if (yread(0, &datlen, 4) != 4)
319
320                         break;
321
322                 dat = xmalloc(datlen);
323                 if (yread(0, dat, datlen) != datlen)
324                         break;
325
326                 hdrlen = sprintf(hdr, "blob %lu", datlen) + 1;
327                 SHA1_Init(&c);
328                 SHA1_Update(&c, hdr, hdrlen);
329                 SHA1_Update(&c, dat, datlen);
330                 SHA1_Final(sha1, &c);
331
332                 e = insert_object(sha1);
333                 if (!e->offset) {
334                         e->offset = packoff;
335                         write_blob(dat, datlen);
336                         object_count++;
337                         printf("%s\n", sha1_to_hex(sha1));
338                         fflush(stdout);
339
340                         if (lastdat)
341                                 free(lastdat);
342                         lastdat = dat;
343                         lastdatlen = datlen;
344                         memcpy(lastsha1, sha1, sizeof(sha1));
345                 } else {
346                         duplicate_count++;
347                         free(dat);
348                 }
349         }
350         fixup_header_footer();
351         close(packfd);
352         write_index(idx_name);
353
354         fprintf(stderr, "%lu objects, %lu duplicates, %lu pool overflow\n",
355                 object_count, duplicate_count, overflow_count);
356
357         return 0;
358 }