add add_object_array_with_mode
[git] / diff-delta.c
1 /*
2  * diff-delta.c: generate a delta between two buffers
3  *
4  *  Many parts of this file have been lifted from LibXDiff version 0.10.
5  *  http://www.xmailserver.org/xdiff-lib.html
6  *
7  *  LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
8  *  Copyright (C) 2003  Davide Libenzi
9  *
10  *  Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
11  *
12  *  This file is free software; you can redistribute it and/or
13  *  modify it under the terms of the GNU Lesser General Public
14  *  License as published by the Free Software Foundation; either
15  *  version 2.1 of the License, or (at your option) any later version.
16  *
17  *  Use of this within git automatically means that the LGPL
18  *  licensing gets turned into GPLv2 within this project.
19  */
20
21 #include "git-compat-util.h"
22 #include "delta.h"
23
24 /* maximum hash entry list for the same hash bucket */
25 #define HASH_LIMIT 64
26
27 #define RABIN_SHIFT 23
28 #define RABIN_WINDOW 16
29
30 static const unsigned int T[256] = {
31         0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
32         0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
33         0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
34         0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
35         0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
36         0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
37         0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
38         0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
39         0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
40         0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
41         0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
42         0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
43         0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
44         0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
45         0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
46         0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
47         0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
48         0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
49         0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
50         0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
51         0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
52         0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
53         0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
54         0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
55         0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
56         0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
57         0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
58         0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
59         0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
60         0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
61         0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
62         0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
63         0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
64         0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
65         0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
66         0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
67         0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
68         0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
69         0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
70         0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
71         0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
72         0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
73         0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
74 };
75
76 static const unsigned int U[256] = {
77         0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
78         0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
79         0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
80         0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
81         0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
82         0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
83         0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
84         0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
85         0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
86         0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
87         0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
88         0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
89         0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
90         0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
91         0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
92         0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
93         0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
94         0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
95         0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
96         0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
97         0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
98         0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
99         0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
100         0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
101         0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
102         0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
103         0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
104         0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
105         0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
106         0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
107         0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
108         0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
109         0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
110         0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
111         0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
112         0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
113         0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
114         0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
115         0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
116         0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
117         0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
118         0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
119         0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
120 };
121
122 struct index_entry {
123         const unsigned char *ptr;
124         unsigned int val;
125         struct index_entry *next;
126 };
127
128 struct delta_index {
129         const void *src_buf;
130         unsigned long src_size;
131         unsigned int hash_mask;
132         struct index_entry *hash[FLEX_ARRAY];
133 };
134
135 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
136 {
137         unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
138         const unsigned char *data, *buffer = buf;
139         struct delta_index *index;
140         struct index_entry *entry, **hash;
141         void *mem;
142         unsigned long memsize;
143
144         if (!buf || !bufsize)
145                 return NULL;
146
147         /* Determine index hash size.  Note that indexing skips the
148            first byte to allow for optimizing the Rabin's polynomial
149            initialization in create_delta(). */
150         entries = (bufsize - 1)  / RABIN_WINDOW;
151         hsize = entries / 4;
152         for (i = 4; (1u << i) < hsize && i < 31; i++);
153         hsize = 1 << i;
154         hmask = hsize - 1;
155
156         /* allocate lookup index */
157         memsize = sizeof(*index) +
158                   sizeof(*hash) * hsize +
159                   sizeof(*entry) * entries;
160         mem = malloc(memsize);
161         if (!mem)
162                 return NULL;
163         index = mem;
164         mem = index + 1;
165         hash = mem;
166         mem = hash + hsize;
167         entry = mem;
168
169         index->src_buf = buf;
170         index->src_size = bufsize;
171         index->hash_mask = hmask;
172         memset(hash, 0, hsize * sizeof(*hash));
173
174         /* allocate an array to count hash entries */
175         hash_count = calloc(hsize, sizeof(*hash_count));
176         if (!hash_count) {
177                 free(index);
178                 return NULL;
179         }
180
181         /* then populate the index */
182         prev_val = ~0;
183         for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
184              data >= buffer;
185              data -= RABIN_WINDOW) {
186                 unsigned int val = 0;
187                 for (i = 1; i <= RABIN_WINDOW; i++)
188                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
189                 if (val == prev_val) {
190                         /* keep the lowest of consecutive identical blocks */
191                         entry[-1].ptr = data + RABIN_WINDOW;
192                 } else {
193                         prev_val = val;
194                         i = val & hmask;
195                         entry->ptr = data + RABIN_WINDOW;
196                         entry->val = val;
197                         entry->next = hash[i];
198                         hash[i] = entry++;
199                         hash_count[i]++;
200                 }
201         }
202
203         /*
204          * Determine a limit on the number of entries in the same hash
205          * bucket.  This guards us against pathological data sets causing
206          * really bad hash distribution with most entries in the same hash
207          * bucket that would bring us to O(m*n) computing costs (m and n
208          * corresponding to reference and target buffer sizes).
209          *
210          * Make sure none of the hash buckets has more entries than
211          * we're willing to test.  Otherwise we cull the entry list
212          * uniformly to still preserve a good repartition across
213          * the reference buffer.
214          */
215         for (i = 0; i < hsize; i++) {
216                 if (hash_count[i] < HASH_LIMIT)
217                         continue;
218                 entry = hash[i];
219                 do {
220                         struct index_entry *keep = entry;
221                         int skip = hash_count[i] / HASH_LIMIT / 2;
222                         do {
223                                 entry = entry->next;
224                         } while(--skip && entry);
225                         keep->next = entry;
226                 } while(entry);
227         }
228         free(hash_count);
229
230         return index;
231 }
232
233 void free_delta_index(struct delta_index *index)
234 {
235         free(index);
236 }
237
238 /*
239  * The maximum size for any opcode sequence, including the initial header
240  * plus Rabin window plus biggest copy.
241  */
242 #define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
243
244 void *
245 create_delta(const struct delta_index *index,
246              const void *trg_buf, unsigned long trg_size,
247              unsigned long *delta_size, unsigned long max_size)
248 {
249         unsigned int i, outpos, outsize, val;
250         int inscnt;
251         const unsigned char *ref_data, *ref_top, *data, *top;
252         unsigned char *out;
253
254         if (!trg_buf || !trg_size)
255                 return NULL;
256
257         outpos = 0;
258         outsize = 8192;
259         if (max_size && outsize >= max_size)
260                 outsize = max_size + MAX_OP_SIZE + 1;
261         out = malloc(outsize);
262         if (!out)
263                 return NULL;
264
265         /* store reference buffer size */
266         i = index->src_size;
267         while (i >= 0x80) {
268                 out[outpos++] = i | 0x80;
269                 i >>= 7;
270         }
271         out[outpos++] = i;
272
273         /* store target buffer size */
274         i = trg_size;
275         while (i >= 0x80) {
276                 out[outpos++] = i | 0x80;
277                 i >>= 7;
278         }
279         out[outpos++] = i;
280
281         ref_data = index->src_buf;
282         ref_top = ref_data + index->src_size;
283         data = trg_buf;
284         top = (const unsigned char *) trg_buf + trg_size;
285
286         outpos++;
287         val = 0;
288         for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
289                 out[outpos++] = *data;
290                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
291         }
292         inscnt = i;
293
294         while (data < top) {
295                 unsigned int moff = 0, msize = 0;
296                 struct index_entry *entry;
297                 val ^= U[data[-RABIN_WINDOW]];
298                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
299                 i = val & index->hash_mask;
300                 for (entry = index->hash[i]; entry; entry = entry->next) {
301                         const unsigned char *ref = entry->ptr;
302                         const unsigned char *src = data;
303                         unsigned int ref_size = ref_top - ref;
304                         if (entry->val != val)
305                                 continue;
306                         if (ref_size > top - src)
307                                 ref_size = top - src;
308                         if (ref_size > 0x10000)
309                                 ref_size = 0x10000;
310                         if (ref_size <= msize)
311                                 break;
312                         while (ref_size-- && *src++ == *ref)
313                                 ref++;
314                         if (msize < ref - entry->ptr) {
315                                 /* this is our best match so far */
316                                 msize = ref - entry->ptr;
317                                 moff = entry->ptr - ref_data;
318                         }
319                 }
320
321                 if (msize < 4) {
322                         if (!inscnt)
323                                 outpos++;
324                         out[outpos++] = *data++;
325                         inscnt++;
326                         if (inscnt == 0x7f) {
327                                 out[outpos - inscnt - 1] = inscnt;
328                                 inscnt = 0;
329                         }
330                 } else {
331                         unsigned char *op;
332
333                         if (msize >= RABIN_WINDOW) {
334                                 const unsigned char *sk;
335                                 sk = data + msize - RABIN_WINDOW;
336                                 val = 0;
337                                 for (i = 0; i < RABIN_WINDOW; i++)
338                                         val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
339                         } else {
340                                 const unsigned char *sk = data + 1;
341                                 for (i = 1; i < msize; i++) {
342                                         val ^= U[sk[-RABIN_WINDOW]];
343                                         val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
344                                 }
345                         }
346
347                         if (inscnt) {
348                                 while (moff && ref_data[moff-1] == data[-1]) {
349                                         if (msize == 0x10000)
350                                                 break;
351                                         /* we can match one byte back */
352                                         msize++;
353                                         moff--;
354                                         data--;
355                                         outpos--;
356                                         if (--inscnt)
357                                                 continue;
358                                         outpos--;  /* remove count slot */
359                                         inscnt--;  /* make it -1 */
360                                         break;
361                                 }
362                                 out[outpos - inscnt - 1] = inscnt;
363                                 inscnt = 0;
364                         }
365
366                         data += msize;
367                         op = out + outpos++;
368                         i = 0x80;
369
370                         if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
371                         moff >>= 8;
372                         if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
373                         moff >>= 8;
374                         if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
375                         moff >>= 8;
376                         if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
377
378                         if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
379                         msize >>= 8;
380                         if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
381
382                         *op = i;
383                 }
384
385                 if (outpos >= outsize - MAX_OP_SIZE) {
386                         void *tmp = out;
387                         outsize = outsize * 3 / 2;
388                         if (max_size && outsize >= max_size)
389                                 outsize = max_size + MAX_OP_SIZE + 1;
390                         if (max_size && outpos > max_size)
391                                 break;
392                         out = xrealloc(out, outsize);
393                         if (!out) {
394                                 free(tmp);
395                                 return NULL;
396                         }
397                 }
398         }
399
400         if (inscnt)
401                 out[outpos - inscnt - 1] = inscnt;
402
403         if (max_size && outpos > max_size) {
404                 free(out);
405                 return NULL;
406         }
407
408         *delta_size = outpos;
409         return out;
410 }