ce_match_stat, run_diff_files: use symbolic constants for readability
[git] / diff-delta.c
1 /*
2  * diff-delta.c: generate a delta between two buffers
3  *
4  * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5  * http://www.xmailserver.org/xdiff-lib.html
6  *
7  * Rewritten for GIT by Nicolas Pitre <nico@cam.org>, (C) 2005-2007
8  *
9  * This code is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  */
13
14 #include "git-compat-util.h"
15 #include "delta.h"
16
17 /* maximum hash entry list for the same hash bucket */
18 #define HASH_LIMIT 64
19
20 #define RABIN_SHIFT 23
21 #define RABIN_WINDOW 16
22
23 static const unsigned int T[256] = {
24         0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
25         0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
26         0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
27         0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
28         0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
29         0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
30         0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
31         0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
32         0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
33         0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
34         0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
35         0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
36         0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
37         0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
38         0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
39         0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
40         0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
41         0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
42         0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
43         0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
44         0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
45         0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
46         0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
47         0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
48         0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
49         0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
50         0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
51         0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
52         0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
53         0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
54         0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
55         0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
56         0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
57         0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
58         0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
59         0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
60         0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
61         0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
62         0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
63         0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
64         0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
65         0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
66         0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
67 };
68
69 static const unsigned int U[256] = {
70         0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
71         0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
72         0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
73         0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
74         0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
75         0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
76         0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
77         0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
78         0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
79         0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
80         0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
81         0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
82         0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
83         0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
84         0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
85         0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
86         0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
87         0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
88         0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
89         0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
90         0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
91         0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
92         0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
93         0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
94         0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
95         0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
96         0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
97         0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
98         0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
99         0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
100         0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
101         0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
102         0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
103         0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
104         0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
105         0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
106         0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
107         0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
108         0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
109         0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
110         0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
111         0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
112         0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
113 };
114
115 struct index_entry {
116         const unsigned char *ptr;
117         unsigned int val;
118         struct index_entry *next;
119 };
120
121 struct delta_index {
122         unsigned long memsize;
123         const void *src_buf;
124         unsigned long src_size;
125         unsigned int hash_mask;
126         struct index_entry *hash[FLEX_ARRAY];
127 };
128
129 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
130 {
131         unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
132         const unsigned char *data, *buffer = buf;
133         struct delta_index *index;
134         struct index_entry *entry, **hash;
135         void *mem;
136         unsigned long memsize;
137
138         if (!buf || !bufsize)
139                 return NULL;
140
141         /* Determine index hash size.  Note that indexing skips the
142            first byte to allow for optimizing the Rabin's polynomial
143            initialization in create_delta(). */
144         entries = (bufsize - 1)  / RABIN_WINDOW;
145         hsize = entries / 4;
146         for (i = 4; (1u << i) < hsize && i < 31; i++);
147         hsize = 1 << i;
148         hmask = hsize - 1;
149
150         /* allocate lookup index */
151         memsize = sizeof(*index) +
152                   sizeof(*hash) * hsize +
153                   sizeof(*entry) * entries;
154         mem = malloc(memsize);
155         if (!mem)
156                 return NULL;
157         index = mem;
158         mem = index + 1;
159         hash = mem;
160         mem = hash + hsize;
161         entry = mem;
162
163         index->memsize = memsize;
164         index->src_buf = buf;
165         index->src_size = bufsize;
166         index->hash_mask = hmask;
167         memset(hash, 0, hsize * sizeof(*hash));
168
169         /* allocate an array to count hash entries */
170         hash_count = calloc(hsize, sizeof(*hash_count));
171         if (!hash_count) {
172                 free(index);
173                 return NULL;
174         }
175
176         /* then populate the index */
177         prev_val = ~0;
178         for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
179              data >= buffer;
180              data -= RABIN_WINDOW) {
181                 unsigned int val = 0;
182                 for (i = 1; i <= RABIN_WINDOW; i++)
183                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
184                 if (val == prev_val) {
185                         /* keep the lowest of consecutive identical blocks */
186                         entry[-1].ptr = data + RABIN_WINDOW;
187                 } else {
188                         prev_val = val;
189                         i = val & hmask;
190                         entry->ptr = data + RABIN_WINDOW;
191                         entry->val = val;
192                         entry->next = hash[i];
193                         hash[i] = entry++;
194                         hash_count[i]++;
195                 }
196         }
197
198         /*
199          * Determine a limit on the number of entries in the same hash
200          * bucket.  This guards us against pathological data sets causing
201          * really bad hash distribution with most entries in the same hash
202          * bucket that would bring us to O(m*n) computing costs (m and n
203          * corresponding to reference and target buffer sizes).
204          *
205          * Make sure none of the hash buckets has more entries than
206          * we're willing to test.  Otherwise we cull the entry list
207          * uniformly to still preserve a good repartition across
208          * the reference buffer.
209          */
210         for (i = 0; i < hsize; i++) {
211                 if (hash_count[i] < HASH_LIMIT)
212                         continue;
213                 entry = hash[i];
214                 do {
215                         struct index_entry *keep = entry;
216                         int skip = hash_count[i] / HASH_LIMIT;
217                         do {
218                                 entry = entry->next;
219                         } while(--skip && entry);
220                         keep->next = entry;
221                 } while(entry);
222         }
223         free(hash_count);
224
225         return index;
226 }
227
228 void free_delta_index(struct delta_index *index)
229 {
230         free(index);
231 }
232
233 unsigned long sizeof_delta_index(struct delta_index *index)
234 {
235         if (index)
236                 return index->memsize;
237         else
238                 return 0;
239 }
240
241 /*
242  * The maximum size for any opcode sequence, including the initial header
243  * plus Rabin window plus biggest copy.
244  */
245 #define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
246
247 void *
248 create_delta(const struct delta_index *index,
249              const void *trg_buf, unsigned long trg_size,
250              unsigned long *delta_size, unsigned long max_size)
251 {
252         unsigned int i, outpos, outsize, moff, msize, val;
253         int inscnt;
254         const unsigned char *ref_data, *ref_top, *data, *top;
255         unsigned char *out;
256
257         if (!trg_buf || !trg_size)
258                 return NULL;
259
260         outpos = 0;
261         outsize = 8192;
262         if (max_size && outsize >= max_size)
263                 outsize = max_size + MAX_OP_SIZE + 1;
264         out = malloc(outsize);
265         if (!out)
266                 return NULL;
267
268         /* store reference buffer size */
269         i = index->src_size;
270         while (i >= 0x80) {
271                 out[outpos++] = i | 0x80;
272                 i >>= 7;
273         }
274         out[outpos++] = i;
275
276         /* store target buffer size */
277         i = trg_size;
278         while (i >= 0x80) {
279                 out[outpos++] = i | 0x80;
280                 i >>= 7;
281         }
282         out[outpos++] = i;
283
284         ref_data = index->src_buf;
285         ref_top = ref_data + index->src_size;
286         data = trg_buf;
287         top = (const unsigned char *) trg_buf + trg_size;
288
289         outpos++;
290         val = 0;
291         for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
292                 out[outpos++] = *data;
293                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
294         }
295         inscnt = i;
296
297         moff = 0;
298         msize = 0;
299         while (data < top) {
300                 if (msize < 4096) {
301                         struct index_entry *entry;
302                         val ^= U[data[-RABIN_WINDOW]];
303                         val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
304                         i = val & index->hash_mask;
305                         for (entry = index->hash[i]; entry; entry = entry->next) {
306                                 const unsigned char *ref = entry->ptr;
307                                 const unsigned char *src = data;
308                                 unsigned int ref_size = ref_top - ref;
309                                 if (entry->val != val)
310                                         continue;
311                                 if (ref_size > top - src)
312                                         ref_size = top - src;
313                                 if (ref_size <= msize)
314                                         break;
315                                 while (ref_size-- && *src++ == *ref)
316                                         ref++;
317                                 if (msize < ref - entry->ptr) {
318                                         /* this is our best match so far */
319                                         msize = ref - entry->ptr;
320                                         moff = entry->ptr - ref_data;
321                                         if (msize >= 4096) /* good enough */
322                                                 break;
323                                 }
324                         }
325                 }
326
327                 if (msize < 4) {
328                         if (!inscnt)
329                                 outpos++;
330                         out[outpos++] = *data++;
331                         inscnt++;
332                         if (inscnt == 0x7f) {
333                                 out[outpos - inscnt - 1] = inscnt;
334                                 inscnt = 0;
335                         }
336                         msize = 0;
337                 } else {
338                         unsigned int left;
339                         unsigned char *op;
340
341                         if (inscnt) {
342                                 while (moff && ref_data[moff-1] == data[-1]) {
343                                         /* we can match one byte back */
344                                         msize++;
345                                         moff--;
346                                         data--;
347                                         outpos--;
348                                         if (--inscnt)
349                                                 continue;
350                                         outpos--;  /* remove count slot */
351                                         inscnt--;  /* make it -1 */
352                                         break;
353                                 }
354                                 out[outpos - inscnt - 1] = inscnt;
355                                 inscnt = 0;
356                         }
357
358                         /* A copy op is currently limited to 64KB (pack v2) */
359                         left = (msize < 0x10000) ? 0 : (msize - 0x10000);
360                         msize -= left;
361
362                         op = out + outpos++;
363                         i = 0x80;
364
365                         if (moff & 0x000000ff)
366                                 out[outpos++] = moff >> 0,  i |= 0x01;
367                         if (moff & 0x0000ff00)
368                                 out[outpos++] = moff >> 8,  i |= 0x02;
369                         if (moff & 0x00ff0000)
370                                 out[outpos++] = moff >> 16, i |= 0x04;
371                         if (moff & 0xff000000)
372                                 out[outpos++] = moff >> 24, i |= 0x08;
373
374                         if (msize & 0x00ff)
375                                 out[outpos++] = msize >> 0, i |= 0x10;
376                         if (msize & 0xff00)
377                                 out[outpos++] = msize >> 8, i |= 0x20;
378
379                         *op = i;
380
381                         data += msize;
382                         moff += msize;
383                         msize = left;
384
385                         if (msize < 4096) {
386                                 int j;
387                                 val = 0;
388                                 for (j = -RABIN_WINDOW; j < 0; j++)
389                                         val = ((val << 8) | data[j])
390                                               ^ T[val >> RABIN_SHIFT];
391                         }
392                 }
393
394                 if (outpos >= outsize - MAX_OP_SIZE) {
395                         void *tmp = out;
396                         outsize = outsize * 3 / 2;
397                         if (max_size && outsize >= max_size)
398                                 outsize = max_size + MAX_OP_SIZE + 1;
399                         if (max_size && outpos > max_size)
400                                 break;
401                         out = realloc(out, outsize);
402                         if (!out) {
403                                 free(tmp);
404                                 return NULL;
405                         }
406                 }
407         }
408
409         if (inscnt)
410                 out[outpos - inscnt - 1] = inscnt;
411
412         if (max_size && outpos > max_size) {
413                 free(out);
414                 return NULL;
415         }
416
417         *delta_size = outpos;
418         return out;
419 }