1 /* This header defines OpenCL data types that can handle the immensity
2 * of possible lines/pages/books in the Library of Babel */
10 If we encode each symbol (index) in 5 bits (2^5 = 32 > 25), we get 8 symbols
11 every 5 bytes, and 80 symbols (a line) in 50 bytes. A single line can thus be
12 encoded in 50 uchars, 25 ushorts, 13 uints or 7 ulongs. However, decoding the
13 i-th character requires shifts and/or bitwise ANDs (with 0x1f) across native
14 data types (e.g. vector components). A mess. We don't do that.
16 Assuming we don't want to cross data types, and still we want to use 5 bits per
17 symbol, the encoding possibilities depend on the data type we choose, in the
21 Each uchar encodes a symbol, and wastes 8 - 5 = 3 bits.
22 80 uchars are needed to encode a line. No additional bits are wasted.
23 3_200 uchars are needed to encode a page. No additional bits are wasted.
24 1_312_000 uchars are needed to encode a book. No additional bits are wasted.
27 Each ushort encodes 3 symbols and wastes 16 - 15 = 1 bit.
28 27 ushorts are needed to encode a line. 5 additional bits are wasted.
29 1_066 ushorts are needed to encode a page. 5 additional bits are wasted.
30 437_333 ushorts are needed to encode a book. 10 additional bits are wasted.
33 Each uint encodes 6 symbols and wastes 32 - 30 = 2 bits.
34 14 uints are needed to encode a line. 20 additional bits are wasted.
35 534 uints are needed to encode a page. 20 additional bits are wasted.
36 218_667 uints are needed to encode a book. 10 additional bits are wasted.
39 Each ulong encodes 12 symbols and wastes 64 - 60 = 4 bits.
40 7 ulongs are needed to encode a line. 20 additional bits are wasted.
41 267 ulongs are needed to encode a page. 20 additional bits are wasted.
42 109_334 ulongs are needed to encode a book. 40 additional bits are wasted.
44 Lines and pages are thus optimally encoded by using ushorts. The optimal
45 encoding for a book would use 437_332 ushorts followed by a uchar. More
46 in general, N symbols would be encoded with ushorts only if N % 3 != 1,
47 and ushorts followed by a uchar otherwise. In the stream, the uchar
48 could be told apart from the ushort by looking at the MSB. In other
49 words, the stream parser would read data one byte a time, decoding it as
50 a uchar or reading one more byte to decode a ushort depending on the
51 highest bit of the byte read.
53 Since this only spares as a single byte, which is not much of a gain in
54 terms of efficiency, we can just drop the whole thing and use ushorts
59 #ifdef __OPENCL_VERSION__
60 typedef ushort packel;
62 typedef cl_ushort packel;
63 typedef cl_uchar uchar;
68 #define SYM_PER_PACKEL 3
70 /* returns a packed element for the up to whatever bytes */
73 pack_els(const uchar *b)
78 for (size_t i = 0; p && i < SYM_PER_PACKEL; ++i, ++p) {
79 ret |= (*p & SYM_MASK) << SYM_SHIFT*i;
86 unpack_els(packel u, uchar *b)
88 for (size_t i = 0; i < SYM_PER_PACKEL; ++i) {
94 #ifndef __OPENCL_VERSION__
98 /* encode the C string str into the ushort array line (if it's not
99 * NULL), returns the number of ushorts needed to encode the whole
103 encode_string(const char* str, packel *line) {
104 size_t maxl = strlen(str);
108 size_t need = (maxl+2)/3;
113 while (base < maxl) {
114 for (size_t i = 0; i < SYM_PER_PACKEL; ++i) {
116 b[i] = strchr(alphabet, str[base+i]) - alphabet;
120 line[offset++] = pack_els(b);
121 base += SYM_PER_PACKEL;
123 if (offset != need) {
124 fprintf(stderr, "encoding error: %lu != %lu\n", offset, need);