2  * diff-delta.c: generate a delta between two buffers
 
   4  * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
 
   5  * http://www.xmailserver.org/xdiff-lib.html
 
   7  * Rewritten for GIT by Nicolas Pitre <nico@fluxnic.net>, (C) 2005-2007
 
   9  * This code is free software; you can redistribute it and/or modify
 
  10  * it under the terms of the GNU General Public License version 2 as
 
  11  * published by the Free Software Foundation.
 
  14 #include "git-compat-util.h"
 
  17 /* maximum hash entry list for the same hash bucket */
 
  20 #define RABIN_SHIFT 23
 
  21 #define RABIN_WINDOW 16
 
  23 static const unsigned int T[256] = {
 
  24         0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
 
  25         0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
 
  26         0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
 
  27         0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
 
  28         0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
 
  29         0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
 
  30         0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
 
  31         0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
 
  32         0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
 
  33         0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
 
  34         0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
 
  35         0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
 
  36         0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
 
  37         0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
 
  38         0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
 
  39         0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
 
  40         0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
 
  41         0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
 
  42         0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
 
  43         0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
 
  44         0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
 
  45         0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
 
  46         0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
 
  47         0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
 
  48         0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
 
  49         0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
 
  50         0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
 
  51         0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
 
  52         0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
 
  53         0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
 
  54         0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
 
  55         0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
 
  56         0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
 
  57         0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
 
  58         0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
 
  59         0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
 
  60         0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
 
  61         0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
 
  62         0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
 
  63         0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
 
  64         0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
 
  65         0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
 
  66         0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
 
  69 static const unsigned int U[256] = {
 
  70         0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
 
  71         0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
 
  72         0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
 
  73         0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
 
  74         0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
 
  75         0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
 
  76         0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
 
  77         0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
 
  78         0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
 
  79         0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
 
  80         0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
 
  81         0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
 
  82         0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
 
  83         0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
 
  84         0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
 
  85         0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
 
  86         0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
 
  87         0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
 
  88         0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
 
  89         0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
 
  90         0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
 
  91         0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
 
  92         0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
 
  93         0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
 
  94         0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
 
  95         0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
 
  96         0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
 
  97         0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
 
  98         0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
 
  99         0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
 
 100         0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
 
 101         0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
 
 102         0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
 
 103         0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
 
 104         0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
 
 105         0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
 
 106         0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
 
 107         0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
 
 108         0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
 
 109         0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
 
 110         0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
 
 111         0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
 
 112         0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
 
 116         const unsigned char *ptr;
 
 120 struct unpacked_index_entry {
 
 121         struct index_entry entry;
 
 122         struct unpacked_index_entry *next;
 
 126         unsigned long memsize;
 
 128         unsigned long src_size;
 
 129         unsigned int hash_mask;
 
 130         struct index_entry *hash[FLEX_ARRAY];
 
 133 struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
 
 135         unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
 
 136         const unsigned char *data, *buffer = buf;
 
 137         struct delta_index *index;
 
 138         struct unpacked_index_entry *entry, **hash;
 
 139         struct index_entry *packed_entry, **packed_hash;
 
 141         unsigned long memsize;
 
 143         if (!buf || !bufsize)
 
 146         /* Determine index hash size.  Note that indexing skips the
 
 147            first byte to allow for optimizing the Rabin's polynomial
 
 148            initialization in create_delta(). */
 
 149         entries = (bufsize - 1) / RABIN_WINDOW;
 
 150         if (bufsize >= 0xffffffffUL) {
 
 152                  * Current delta format can't encode offsets into
 
 153                  * reference buffer with more than 32 bits.
 
 155                 entries = 0xfffffffeU / RABIN_WINDOW;
 
 158         for (i = 4; (1u << i) < hsize; i++);
 
 162         /* allocate lookup index */
 
 163         memsize = sizeof(*hash) * hsize +
 
 164                   sizeof(*entry) * entries;
 
 165         mem = malloc(memsize);
 
 172         memset(hash, 0, hsize * sizeof(*hash));
 
 174         /* allocate an array to count hash entries */
 
 175         hash_count = calloc(hsize, sizeof(*hash_count));
 
 181         /* then populate the index */
 
 183         for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
 
 185              data -= RABIN_WINDOW) {
 
 186                 unsigned int val = 0;
 
 187                 for (i = 1; i <= RABIN_WINDOW; i++)
 
 188                         val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
 
 189                 if (val == prev_val) {
 
 190                         /* keep the lowest of consecutive identical blocks */
 
 191                         entry[-1].entry.ptr = data + RABIN_WINDOW;
 
 196                         entry->entry.ptr = data + RABIN_WINDOW;
 
 197                         entry->entry.val = val;
 
 198                         entry->next = hash[i];
 
 205          * Determine a limit on the number of entries in the same hash
 
 206          * bucket.  This guards us against pathological data sets causing
 
 207          * really bad hash distribution with most entries in the same hash
 
 208          * bucket that would bring us to O(m*n) computing costs (m and n
 
 209          * corresponding to reference and target buffer sizes).
 
 211          * Make sure none of the hash buckets has more entries than
 
 212          * we're willing to test.  Otherwise we cull the entry list
 
 213          * uniformly to still preserve a good repartition across
 
 214          * the reference buffer.
 
 216         for (i = 0; i < hsize; i++) {
 
 219                 if (hash_count[i] <= HASH_LIMIT)
 
 222                 /* We leave exactly HASH_LIMIT entries in the bucket */
 
 223                 entries -= hash_count[i] - HASH_LIMIT;
 
 229                  * Assume that this loop is gone through exactly
 
 230                  * HASH_LIMIT times and is entered and left with
 
 231                  * acc==0.  So the first statement in the loop
 
 232                  * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
 
 233                  * to the accumulator, and the inner loop consequently
 
 234                  * is run (hash_count[i]-HASH_LIMIT) times, removing
 
 235                  * one element from the list each time.  Since acc
 
 236                  * balances out to 0 at the final run, the inner loop
 
 237                  * body can't be left with entry==NULL.  So we indeed
 
 238                  * encounter entry==NULL in the outer loop only.
 
 241                         acc += hash_count[i] - HASH_LIMIT;
 
 243                                 struct unpacked_index_entry *keep = entry;
 
 248                                 keep->next = entry->next;
 
 256          * Now create the packed index in array form
 
 257          * rather than linked lists.
 
 259         memsize = sizeof(*index)
 
 260                 + sizeof(*packed_hash) * (hsize+1)
 
 261                 + sizeof(*packed_entry) * entries;
 
 262         mem = malloc(memsize);
 
 269         index->memsize = memsize;
 
 270         index->src_buf = buf;
 
 271         index->src_size = bufsize;
 
 272         index->hash_mask = hmask;
 
 276         mem = packed_hash + (hsize+1);
 
 279         for (i = 0; i < hsize; i++) {
 
 281                  * Coalesce all entries belonging to one linked list
 
 282                  * into consecutive array entries.
 
 284                 packed_hash[i] = packed_entry;
 
 285                 for (entry = hash[i]; entry; entry = entry->next)
 
 286                         *packed_entry++ = entry->entry;
 
 289         /* Sentinel value to indicate the length of the last hash bucket */
 
 290         packed_hash[hsize] = packed_entry;
 
 292         assert(packed_entry - (struct index_entry *)mem == entries);
 
 298 void free_delta_index(struct delta_index *index)
 
 303 unsigned long sizeof_delta_index(struct delta_index *index)
 
 306                 return index->memsize;
 
 312  * The maximum size for any opcode sequence, including the initial header
 
 313  * plus Rabin window plus biggest copy.
 
 315 #define MAX_OP_SIZE     (5 + 5 + 1 + RABIN_WINDOW + 7)
 
 318 create_delta(const struct delta_index *index,
 
 319              const void *trg_buf, unsigned long trg_size,
 
 320              unsigned long *delta_size, unsigned long max_size)
 
 322         unsigned int i, outpos, outsize, moff, msize, val;
 
 324         const unsigned char *ref_data, *ref_top, *data, *top;
 
 327         if (!trg_buf || !trg_size)
 
 332         if (max_size && outsize >= max_size)
 
 333                 outsize = max_size + MAX_OP_SIZE + 1;
 
 334         out = malloc(outsize);
 
 338         /* store reference buffer size */
 
 341                 out[outpos++] = i | 0x80;
 
 346         /* store target buffer size */
 
 349                 out[outpos++] = i | 0x80;
 
 354         ref_data = index->src_buf;
 
 355         ref_top = ref_data + index->src_size;
 
 357         top = (const unsigned char *) trg_buf + trg_size;
 
 361         for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
 
 362                 out[outpos++] = *data;
 
 363                 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 
 371                         struct index_entry *entry;
 
 372                         val ^= U[data[-RABIN_WINDOW]];
 
 373                         val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 
 374                         i = val & index->hash_mask;
 
 375                         for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 
 376                                 const unsigned char *ref = entry->ptr;
 
 377                                 const unsigned char *src = data;
 
 378                                 unsigned int ref_size = ref_top - ref;
 
 379                                 if (entry->val != val)
 
 381                                 if (ref_size > top - src)
 
 382                                         ref_size = top - src;
 
 383                                 if (ref_size <= msize)
 
 385                                 while (ref_size-- && *src++ == *ref)
 
 387                                 if (msize < ref - entry->ptr) {
 
 388                                         /* this is our best match so far */
 
 389                                         msize = ref - entry->ptr;
 
 390                                         moff = entry->ptr - ref_data;
 
 391                                         if (msize >= 4096) /* good enough */
 
 400                         out[outpos++] = *data++;
 
 402                         if (inscnt == 0x7f) {
 
 403                                 out[outpos - inscnt - 1] = inscnt;
 
 412                                 while (moff && ref_data[moff-1] == data[-1]) {
 
 413                                         /* we can match one byte back */
 
 420                                         outpos--;  /* remove count slot */
 
 421                                         inscnt--;  /* make it -1 */
 
 424                                 out[outpos - inscnt - 1] = inscnt;
 
 428                         /* A copy op is currently limited to 64KB (pack v2) */
 
 429                         left = (msize < 0x10000) ? 0 : (msize - 0x10000);
 
 435                         if (moff & 0x000000ff)
 
 436                                 out[outpos++] = moff >> 0,  i |= 0x01;
 
 437                         if (moff & 0x0000ff00)
 
 438                                 out[outpos++] = moff >> 8,  i |= 0x02;
 
 439                         if (moff & 0x00ff0000)
 
 440                                 out[outpos++] = moff >> 16, i |= 0x04;
 
 441                         if (moff & 0xff000000)
 
 442                                 out[outpos++] = moff >> 24, i |= 0x08;
 
 445                                 out[outpos++] = msize >> 0, i |= 0x10;
 
 447                                 out[outpos++] = msize >> 8, i |= 0x20;
 
 458                                 for (j = -RABIN_WINDOW; j < 0; j++)
 
 459                                         val = ((val << 8) | data[j])
 
 460                                               ^ T[val >> RABIN_SHIFT];
 
 464                 if (outpos >= outsize - MAX_OP_SIZE) {
 
 466                         outsize = outsize * 3 / 2;
 
 467                         if (max_size && outsize >= max_size)
 
 468                                 outsize = max_size + MAX_OP_SIZE + 1;
 
 469                         if (max_size && outpos > max_size)
 
 471                         out = realloc(out, outsize);
 
 480                 out[outpos - inscnt - 1] = inscnt;
 
 482         if (max_size && outpos > max_size) {
 
 487         *delta_size = outpos;