2  * diff-delta.c: generate a delta between two buffers
 
   4  * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
 
   5  * http://www.xmailserver.org/xdiff-lib.html
 
   7  * Rewritten for GIT by Nicolas Pitre <nico@fluxnic.net>, (C) 2005-2007
 
   9  * This code is free software; you can redistribute it and/or modify
 
  10  * it under the terms of the GNU General Public License version 2 as
 
  11  * published by the Free Software Foundation.
 
  14 #include "git-compat-util.h"
 
  17 /* maximum hash entry list for the same hash bucket */
 
  20 #define RABIN_SHIFT 23
 
  21 #define RABIN_WINDOW 16
 
  23 static const unsigned int T[256] = {
 
  24         0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
 
  25         0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
 
  26         0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
 
  27         0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
 
  28         0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
 
  29         0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
 
  30         0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
 
  31         0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
 
  32         0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
 
  33         0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
 
  34         0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
 
  35         0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
 
  36         0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
 
  37         0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
 
  38         0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
 
  39         0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
 
  40         0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
 
  41         0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
 
  42         0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
 
  43         0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
 
  44         0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
 
  45         0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
 
  46         0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
 
  47         0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
 
  48         0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
 
  49         0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
 
  50         0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
 
  51         0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
 
  52         0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
 
  53         0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
 
  54         0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
 
  55         0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
 
  56         0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
 
  57         0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
 
  58         0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
 
  59         0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
 
  60         0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
 
  61         0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
 
  62         0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
 
  63         0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
 
  64         0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
 
  65         0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
 
  66         0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
 
  69 static const unsigned int U[256] = {
 
  70         0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
 
  71         0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
 
  72         0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
 
  73         0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
 
  74         0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
 
  75         0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
 
  76         0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
 
  77         0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
 
  78         0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
 
  79         0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
 
  80         0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
 
  81         0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
 
  82         0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
 
  83         0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
 
  84         0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
 
  85         0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
 
  86         0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
 
  87         0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
 
  88         0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
 
  89         0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
 
  90         0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
 
  91         0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
 
  92         0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
 
  93         0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
 
  94         0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
 
  95         0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
 
  96         0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
 
  97         0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
 
  98         0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
 
  99         0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 
 100         0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 
 101         0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 
 102         0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 
 103         0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 
 104         0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 
 105         0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 
 106         0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 
 107         0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 
 108         0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 
 109         0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 
 110         0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 
 111         0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 
 112         0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 
 116         const unsigned char *ptr;
 
 120 struct unpacked_index_entry {
 
 121         struct index_entry entry;
 
 122         struct unpacked_index_entry *next;
 
 126         unsigned long memsize;
 
 128         unsigned long src_size;
 
 129         unsigned int hash_mask;
 
 130         struct index_entry *hash[FLEX_ARRAY];
 
 133 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 
 135         unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 
 136         const unsigned char *data, *buffer = buf;
 
 137         struct delta_index *index;
 
 138         struct unpacked_index_entry *entry, **hash;
 
 139         struct index_entry *packed_entry, **packed_hash;
 
 141         unsigned long memsize;
 
 143         if (!buf || !bufsize)
 
 146         /* Determine index hash size.  Note that indexing skips the
 
 147            first byte to allow for optimizing the Rabin's polynomial
 
 148            initialization in create_delta(). */
 
 149         entries = (bufsize - 1) / RABIN_WINDOW;
 
 150         if (bufsize >= 0xffffffffUL) {
 
 152                  * Current delta format can't encode offsets into
 
 153                  * reference buffer with more than 32 bits.
 
 155                 entries = 0xfffffffeU / RABIN_WINDOW;
 
 158         for (i = 4; (1u << i) < hsize; i++);
 
 162         /* allocate lookup index */
 
 163         memsize = sizeof(*hash) * hsize +
 
 164                   sizeof(*entry) * entries;
 
 165         mem = malloc(memsize);
 
 172         memset(hash, 0, hsize * sizeof(*hash));
 
 174         /* allocate an array to count hash entries */
 
 175         hash_count = calloc(hsize, sizeof(*hash_count));
 
 181         /* then populate the index */
 
 183         for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 
 185              data -= RABIN_WINDOW) {
 
 186                 unsigned int val = 0;
 
 187                 for (i = 1; i <= RABIN_WINDOW; i++)
 
 188                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 
 189                 if (val == prev_val) {
 
 190                         /* keep the lowest of consecutive identical blocks */
 
 191                         entry[-1].entry.ptr = data + RABIN_WINDOW;
 
 196                         entry->entry.ptr = data + RABIN_WINDOW;
 
 197                         entry->entry.val = val;
 
 198                         entry->next = hash[i];
 
 205          * Determine a limit on the number of entries in the same hash
 
 206          * bucket.  This guards us against pathological data sets causing
 
 207          * really bad hash distribution with most entries in the same hash
 
 208          * bucket that would bring us to O(m*n) computing costs (m and n
 
 209          * corresponding to reference and target buffer sizes).
 
 211          * Make sure none of the hash buckets has more entries than
 
 212          * we're willing to test.  Otherwise we cull the entry list
 
 213          * uniformly to still preserve a good repartition across
 
 214          * the reference buffer.
 
 216         for (i = 0; i < hsize; i++) {
 
 219                 if (hash_count[i] <= HASH_LIMIT)
 
 222                 /* We leave exactly HASH_LIMIT entries in the bucket */
 
 223                 entries -= hash_count[i] - HASH_LIMIT;
 
 229                  * Assume that this loop is gone through exactly
 
 230                  * HASH_LIMIT times and is entered and left with
 
 231                  * acc==0.  So the first statement in the loop
 
 232                  * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
 
 233                  * to the accumulator, and the inner loop consequently
 
 234                  * is run (hash_count[i]-HASH_LIMIT) times, removing
 
 235                  * one element from the list each time.  Since acc
 
 236                  * balances out to 0 at the final run, the inner loop
 
 237                  * body can't be left with entry==NULL.  So we indeed
 
 238                  * encounter entry==NULL in the outer loop only.
 
 241                         acc += hash_count[i] - HASH_LIMIT;
 
 243                                 struct unpacked_index_entry *keep = entry;
 
 248                                 keep->next = entry->next;
 
 256          * Now create the packed index in array form
 
 257          * rather than linked lists.
 
 259         memsize = sizeof(*index)
 
 260                 + sizeof(*packed_hash) * (hsize+1)
 
 261                 + sizeof(*packed_entry) * entries;
 
 262         mem = malloc(memsize);
 
 269         index->memsize = memsize;
 
 270         index->src_buf = buf;
 
 271         index->src_size = bufsize;
 
 272         index->hash_mask = hmask;
 
 276         mem = packed_hash + (hsize+1);
 
 279         for (i = 0; i < hsize; i++) {
 
 281                  * Coalesce all entries belonging to one linked list
 
 282                  * into consecutive array entries.
 
 284                 packed_hash[i] = packed_entry;
 
 285                 for (entry = hash[i]; entry; entry = entry->next)
 
 286                         *packed_entry++ = entry->entry;
 
 289         /* Sentinel value to indicate the length of the last hash bucket */
 
 290         packed_hash[hsize] = packed_entry;
 
 292         assert(packed_entry - (struct index_entry *)mem == entries);
 
 298 void free_delta_index(struct delta_index *index)
 
 303 unsigned long sizeof_delta_index(struct delta_index *index)
 
 306                 return index->memsize;
 
 312  * The maximum size for any opcode sequence, including the initial header
 
 313  * plus Rabin window plus biggest copy.
 
 315 #define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 
 318 create_delta(const struct delta_index *index,
 
 319              const void *trg_buf, unsigned long trg_size,
 
 320              unsigned long *delta_size, unsigned long max_size)
 
 324         size_t l, outsize, msize;
 
 326         const unsigned char *ref_data, *ref_top, *data, *top;
 
 331         if (!trg_buf || !trg_size)
 
 336         if (max_size && outsize >= max_size)
 
 337                 outsize = max_size + MAX_OP_SIZE + 1;
 
 338         out = malloc(outsize);
 
 342         /* store reference buffer size */
 
 345                 out[outpos++] = l | 0x80;
 
 350         /* store target buffer size */
 
 353                 out[outpos++] = l | 0x80;
 
 358         ref_data = index->src_buf;
 
 359         ref_top = ref_data + index->src_size;
 
 361         top = (const unsigned char *) trg_buf + trg_size;
 
 365         for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 
 366                 out[outpos++] = *data;
 
 367                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 
 375                         struct index_entry *entry;
 
 376                         val ^= U[data[-RABIN_WINDOW]];
 
 377                         val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 
 378                         i = val & index->hash_mask;
 
 379                         for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 
 380                                 const unsigned char *ref = entry->ptr;
 
 381                                 const unsigned char *src = data;
 
 382                                 unsigned int ref_size = ref_top - ref;
 
 383                                 if (entry->val != val)
 
 385                                 if (ref_size > top - src)
 
 386                                         ref_size = top - src;
 
 387                                 if (ref_size <= msize)
 
 389                                 while (ref_size-- && *src++ == *ref)
 
 391                                 if (msize < ref - entry->ptr) {
 
 392                                         /* this is our best match so far */
 
 393                                         msize = ref - entry->ptr;
 
 394                                         moff = entry->ptr - ref_data;
 
 395                                         if (msize >= 4096) /* good enough */
 
 404                         out[outpos++] = *data++;
 
 406                         if (inscnt == 0x7f) {
 
 407                                 out[outpos - inscnt - 1] = inscnt;
 
 416                                 while (moff && ref_data[moff-1] == data[-1]) {
 
 417                                         /* we can match one byte back */
 
 424                                         outpos--;  /* remove count slot */
 
 425                                         inscnt--;  /* make it -1 */
 
 428                                 out[outpos - inscnt - 1] = inscnt;
 
 432                         /* A copy op is currently limited to 64KB (pack v2) */
 
 433                         left = (msize < 0x10000) ? 0 : (msize - 0x10000);
 
 439                         if (moff & 0x000000ff)
 
 440                                 out[outpos++] = moff >> 0,  i |= 0x01;
 
 441                         if (moff & 0x0000ff00)
 
 442                                 out[outpos++] = moff >> 8,  i |= 0x02;
 
 443                         if (moff & 0x00ff0000)
 
 444                                 out[outpos++] = moff >> 16, i |= 0x04;
 
 445                         if (moff & 0xff000000)
 
 446                                 out[outpos++] = moff >> 24, i |= 0x08;
 
 449                                 out[outpos++] = msize >> 0, i |= 0x10;
 
 451                                 out[outpos++] = msize >> 8, i |= 0x20;
 
 459                         if (moff > 0xffffffff)
 
 465                                 for (j = -RABIN_WINDOW; j < 0; j++)
 
 466                                         val = ((val << 8) | data[j])
 
 467                                               ^ T[val >> RABIN_SHIFT];
 
 471                 if (outpos >= outsize - MAX_OP_SIZE) {
 
 473                         outsize = outsize * 3 / 2;
 
 474                         if (max_size && outsize >= max_size)
 
 475                                 outsize = max_size + MAX_OP_SIZE + 1;
 
 476                         if (max_size && outpos > max_size)
 
 478                         out = realloc(out, outsize);
 
 487                 out[outpos - inscnt - 1] = inscnt;
 
 489         if (max_size && outpos > max_size) {
 
 494         *delta_size = outpos;