2 * diff-delta.c: generate a delta between two buffers
4 * Many parts of this file have been lifted from LibXDiff version 0.10.
5 * http://www.xmailserver.org/xdiff-lib.html
7 * LibXDiff was written by Davide Libenzi <davidel@xmailserver.org>
8 * Copyright (C) 2003 Davide Libenzi
10 * Many mods for GIT usage by Nicolas Pitre <nico@cam.org>, (C) 2005.
12 * This file is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
17 * Use of this within git automatically means that the LGPL
18 * licensing gets turned into GPLv2 within this project.
21 #include "git-compat-util.h"
24 /* maximum hash entry list for the same hash bucket */
27 #define RABIN_SHIFT 23
28 #define RABIN_WINDOW 16
30 static const unsigned int T[256] = {
31 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
32 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
33 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
34 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
35 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
36 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
37 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
38 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
39 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
40 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
41 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
42 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
43 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
44 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
45 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
46 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
47 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
48 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
49 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
50 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
51 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
52 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
53 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
54 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
55 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
56 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
57 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
58 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
59 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
60 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
61 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
62 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
63 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
64 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
65 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
66 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
67 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
68 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
69 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
70 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
71 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
72 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
73 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
76 static const unsigned int U[256] = {
77 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
78 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
79 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
80 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
81 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
82 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
83 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
84 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
85 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
86 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
87 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
88 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
89 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
90 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
91 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
92 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
93 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
94 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
95 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
96 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
97 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
98 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
99 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
100 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
101 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
102 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
103 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
104 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
105 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
106 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
107 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
108 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
109 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
110 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
111 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
112 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
113 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
114 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
115 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
116 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
117 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
118 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
119 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
123 const unsigned char *ptr;
125 struct index_entry *next;
130 unsigned long src_size;
131 unsigned int hash_mask;
132 struct index_entry *hash[FLEX_ARRAY];
135 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
137 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
138 const unsigned char *data, *buffer = buf;
139 struct delta_index *index;
140 struct index_entry *entry, **hash;
142 unsigned long memsize;
144 if (!buf || !bufsize)
147 /* Determine index hash size. Note that indexing skips the
148 first byte to allow for optimizing the Rabin's polynomial
149 initialization in create_delta(). */
150 entries = (bufsize - 1) / RABIN_WINDOW;
152 for (i = 4; (1u << i) < hsize && i < 31; i++);
156 /* allocate lookup index */
157 memsize = sizeof(*index) +
158 sizeof(*hash) * hsize +
159 sizeof(*entry) * entries;
160 mem = malloc(memsize);
169 index->src_buf = buf;
170 index->src_size = bufsize;
171 index->hash_mask = hmask;
172 memset(hash, 0, hsize * sizeof(*hash));
174 /* allocate an array to count hash entries */
175 hash_count = calloc(hsize, sizeof(*hash_count));
181 /* then populate the index */
183 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
185 data -= RABIN_WINDOW) {
186 unsigned int val = 0;
187 for (i = 1; i <= RABIN_WINDOW; i++)
188 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
189 if (val == prev_val) {
190 /* keep the lowest of consecutive identical blocks */
191 entry[-1].ptr = data + RABIN_WINDOW;
195 entry->ptr = data + RABIN_WINDOW;
197 entry->next = hash[i];
204 * Determine a limit on the number of entries in the same hash
205 * bucket. This guards us against pathological data sets causing
206 * really bad hash distribution with most entries in the same hash
207 * bucket that would bring us to O(m*n) computing costs (m and n
208 * corresponding to reference and target buffer sizes).
210 * Make sure none of the hash buckets has more entries than
211 * we're willing to test. Otherwise we cull the entry list
212 * uniformly to still preserve a good repartition across
213 * the reference buffer.
215 for (i = 0; i < hsize; i++) {
216 if (hash_count[i] < HASH_LIMIT)
220 struct index_entry *keep = entry;
221 int skip = hash_count[i] / HASH_LIMIT / 2;
224 } while(--skip && entry);
233 void free_delta_index(struct delta_index *index)
239 * The maximum size for any opcode sequence, including the initial header
240 * plus Rabin window plus biggest copy.
242 #define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
245 create_delta(const struct delta_index *index,
246 const void *trg_buf, unsigned long trg_size,
247 unsigned long *delta_size, unsigned long max_size)
249 unsigned int i, outpos, outsize, val;
251 const unsigned char *ref_data, *ref_top, *data, *top;
254 if (!trg_buf || !trg_size)
259 if (max_size && outsize >= max_size)
260 outsize = max_size + MAX_OP_SIZE + 1;
261 out = malloc(outsize);
265 /* store reference buffer size */
268 out[outpos++] = i | 0x80;
273 /* store target buffer size */
276 out[outpos++] = i | 0x80;
281 ref_data = index->src_buf;
282 ref_top = ref_data + index->src_size;
284 top = (const unsigned char *) trg_buf + trg_size;
288 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
289 out[outpos++] = *data;
290 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
295 unsigned int moff = 0, msize = 0;
296 struct index_entry *entry;
297 val ^= U[data[-RABIN_WINDOW]];
298 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
299 i = val & index->hash_mask;
300 for (entry = index->hash[i]; entry; entry = entry->next) {
301 const unsigned char *ref = entry->ptr;
302 const unsigned char *src = data;
303 unsigned int ref_size = ref_top - ref;
304 if (entry->val != val)
306 if (ref_size > top - src)
307 ref_size = top - src;
308 if (ref_size > 0x10000)
310 if (ref_size <= msize)
312 while (ref_size-- && *src++ == *ref)
314 if (msize < ref - entry->ptr) {
315 /* this is our best match so far */
316 msize = ref - entry->ptr;
317 moff = entry->ptr - ref_data;
324 out[outpos++] = *data++;
326 if (inscnt == 0x7f) {
327 out[outpos - inscnt - 1] = inscnt;
333 if (msize >= RABIN_WINDOW) {
334 const unsigned char *sk;
335 sk = data + msize - RABIN_WINDOW;
337 for (i = 0; i < RABIN_WINDOW; i++)
338 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
340 const unsigned char *sk = data + 1;
341 for (i = 1; i < msize; i++) {
342 val ^= U[sk[-RABIN_WINDOW]];
343 val = ((val << 8) | *sk++) ^ T[val >> RABIN_SHIFT];
348 while (moff && ref_data[moff-1] == data[-1]) {
349 if (msize == 0x10000)
351 /* we can match one byte back */
358 outpos--; /* remove count slot */
359 inscnt--; /* make it -1 */
362 out[outpos - inscnt - 1] = inscnt;
370 if (moff & 0xff) { out[outpos++] = moff; i |= 0x01; }
372 if (moff & 0xff) { out[outpos++] = moff; i |= 0x02; }
374 if (moff & 0xff) { out[outpos++] = moff; i |= 0x04; }
376 if (moff & 0xff) { out[outpos++] = moff; i |= 0x08; }
378 if (msize & 0xff) { out[outpos++] = msize; i |= 0x10; }
380 if (msize & 0xff) { out[outpos++] = msize; i |= 0x20; }
385 if (outpos >= outsize - MAX_OP_SIZE) {
387 outsize = outsize * 3 / 2;
388 if (max_size && outsize >= max_size)
389 outsize = max_size + MAX_OP_SIZE + 1;
390 if (max_size && outpos > max_size)
392 out = xrealloc(out, outsize);
401 out[outpos - inscnt - 1] = inscnt;
403 if (max_size && outpos > max_size) {
408 *delta_size = outpos;