git.sesse.net Git - plocate/blob - turbopfor-encode.h

   1 #ifndef _TURBOPFOR_ENCODE_H
   2 #define _TURBOPFOR_ENCODE_H
   3
   4 // Much like turbopfor.h (and shares all of the same caveats), except this is
   5 // for encoding. It is _much_ slower than the reference implementation, but we
   6 // encode only during build, and most time in build is spent in other things
   7 // than encoding posting lists, so it only costs ~5-10% overall. Does not use
   8 // any special instruction sets, and generally isn't optimized at all.
   9 //
  10 // It encodes roughly about as dense as the reference encoder.
  11
  12 #include "turbopfor-common.h"
  13
  14 #include <algorithm>
  15 #include <assert.h>
  16 #ifdef HAS_ENDIAN_H
  17 #include <endian.h>
  18 #endif
  19 #include <limits.h>
  20 #include <stdint.h>
  21 #include <string.h>
  22
  23 template<class Docid>
  24 void write_le(Docid val, void *out)
  25 {
  26         if constexpr (sizeof(Docid) == 8) {
  27                 val = htole64(val);
  28         } else if constexpr (sizeof(Docid) == 4) {
  29                 val = htole32(val);
  30         } else if constexpr (sizeof(Docid) == 2) {
  31                 val = htole16(val);
  32         } else if constexpr (sizeof(Docid) == 1) {
  33                 // No change.
  34         } else {
  35                 assert(false);
  36         }
  37         memcpy(out, &val, sizeof(val));
  38 }
  39
  40 // Corresponds to read_baseval.
  41 template<class Docid>
  42 unsigned char *write_baseval(Docid in, unsigned char *out)
  43 {
  44         if (in < 128) {
  45                 *out = in;
  46                 return out + 1;
  47         } else if (in < 0x4000) {
  48                 out[0] = (in >> 8) | 0x80;
  49                 out[1] = in & 0xff;
  50                 return out + 2;
  51         } else if (in < 0x200000) {
  52                 out[0] = (in >> 16) | 0xc0;
  53                 out[1] = in & 0xff;
  54                 out[2] = (in >> 8) & 0xff;
  55                 return out + 3;
  56         } else if (in < 0x10000000) {
  57                 out[0] = (in >> 24) | 0xe0;
  58                 out[1] = (in >> 16) & 0xff;
  59                 out[2] = (in >> 8) & 0xff;
  60                 out[3] = in & 0xff;
  61                 return out + 4;
  62         } else {
  63                 assert(false);  // Not implemented.
  64         }
  65 }
  66
  67 // Writes a varbyte-encoded exception.
  68 template<class Docid>
  69 unsigned char *write_vb(Docid val, unsigned char *out)
  70 {
  71         if (val <= 176) {
  72                 *out++ = val;
  73                 return out;
  74         } else if (val <= 16560) {
  75                 val -= 177;
  76                 *out++ = (val >> 8) + 177;
  77                 *out++ = val & 0xff;
  78                 return out;
  79         } else if (val <= 540848) {
  80                 val -= 16561;
  81                 *out = (val >> 16) + 241;
  82                 write_le<uint16_t>(val & 0xffff, out + 1);
  83                 return out + 3;
  84         } else if (val <= 16777215) {
  85                 *out = 249;
  86                 write_le<uint32_t>(val, out + 1);
  87                 return out + 4;
  88         } else {
  89                 *out = 250;
  90                 write_le<uint32_t>(val, out + 1);
  91                 return out + 5;
  92         }
  93 }
  94
  95 template<class Docid>
  96 inline unsigned num_bits(Docid x)
  97 {
  98 #ifdef __GNUC__
  99         if (x == 0) {
 100                 return 0;
 101         } else {
 102                 return sizeof(Docid) * CHAR_BIT - __builtin_clz(x);
 103         }
 104 #else
 105         for (int i = sizeof(Docid) * CHAR_BIT; i-- > 0;) {
 106                 if (x & (Docid{ 1 } << i)) {
 107                         return i;
 108                 }
 109         }
 110         return 0;
 111 #endif
 112 }
 113
 114 struct BitWriter {
 115 public:
 116         BitWriter(unsigned char *out, unsigned bits)
 117                 : out(out), bits(bits) {}
 118         void write(uint32_t val)
 119         {
 120                 cur_val |= val << bits_used;
 121                 write_le<uint32_t>(cur_val, out);
 122
 123                 bits_used += bits;
 124                 cur_val >>= (bits_used / 8) * 8;
 125                 out += bits_used / 8;
 126                 bits_used %= 8;
 127         }
 128
 129 private:
 130         unsigned char *out;
 131         const unsigned bits;
 132         unsigned bits_used = 0;
 133         unsigned cur_val = 0;
 134 };
 135
 136 template<unsigned NumStreams>
 137 struct InterleavedBitWriter {
 138 public:
 139         InterleavedBitWriter(unsigned char *out, unsigned bits)
 140                 : out(out), bits(bits) {}
 141         void write(uint32_t val)
 142         {
 143                 cur_val |= uint64_t(val) << bits_used;
 144                 if (bits_used + bits >= 32) {
 145                         write_le<uint32_t>(cur_val & 0xffffffff, out);
 146                         out += Stride;
 147                         cur_val >>= 32;
 148                         bits_used -= 32;  // Underflow, but will be fixed below.
 149                 }
 150                 write_le<uint32_t>(cur_val, out);
 151                 bits_used += bits;
 152         }
 153
 154 private:
 155         static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
 156         unsigned char *out;
 157         const unsigned bits;
 158         unsigned bits_used = 0;
 159         uint64_t cur_val = 0;
 160 };
 161
 162 // Bitpacks a set of values (making sure the top bits are lopped off).
 163 // If interleaved is set, makes SSE2-compatible interleaving (this is
 164 // only allowed for full blocks).
 165 template<class Docid>
 166 unsigned char *encode_bitmap(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
 167 {
 168         unsigned mask = mask_for_bits(bit_width);
 169         if (interleaved) {
 170                 InterleavedBitWriter<4> bs0(out + 0 * sizeof(uint32_t), bit_width);
 171                 InterleavedBitWriter<4> bs1(out + 1 * sizeof(uint32_t), bit_width);
 172                 InterleavedBitWriter<4> bs2(out + 2 * sizeof(uint32_t), bit_width);
 173                 InterleavedBitWriter<4> bs3(out + 3 * sizeof(uint32_t), bit_width);
 174                 assert(num % 4 == 0);
 175                 for (unsigned i = 0; i < num / 4; ++i) {
 176                         bs0.write(in[i * 4 + 0] & mask);
 177                         bs1.write(in[i * 4 + 1] & mask);
 178                         bs2.write(in[i * 4 + 2] & mask);
 179                         bs3.write(in[i * 4 + 3] & mask);
 180                 }
 181         } else {
 182                 BitWriter bs(out, bit_width);
 183                 for (unsigned i = 0; i < num; ++i) {
 184                         bs.write(in[i] & mask);
 185                 }
 186         }
 187         return out + bytes_for_packed_bits(num, bit_width);
 188 }
 189
 190 // See decode_for() for the format.
 191 template<class Docid>
 192 unsigned char *encode_for(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
 193 {
 194         return encode_bitmap(in, num, bit_width, interleaved, out);
 195 }
 196
 197 // See decode_pfor_bitmap() for the format.
 198 template<class Docid>
 199 unsigned char *encode_pfor_bitmap(const Docid *in, unsigned num, unsigned bit_width, unsigned exception_bit_width, bool interleaved, unsigned char *out)
 200 {
 201         *out++ = exception_bit_width;
 202
 203         // Bitmap of exceptions.
 204         {
 205                 BitWriter bs(out, 1);
 206                 for (unsigned i = 0; i < num; ++i) {
 207                         bs.write((in[i] >> bit_width) != 0);
 208                 }
 209                 out += bytes_for_packed_bits(num, 1);
 210         }
 211
 212         // Exceptions.
 213         {
 214                 BitWriter bs(out, exception_bit_width);
 215                 unsigned num_exceptions = 0;
 216                 for (unsigned i = 0; i < num; ++i) {
 217                         if ((in[i] >> bit_width) != 0) {
 218                                 bs.write(in[i] >> bit_width);
 219                                 ++num_exceptions;
 220                         }
 221                 }
 222                 out += bytes_for_packed_bits(num_exceptions, exception_bit_width);
 223         }
 224
 225         // Base values.
 226         out = encode_bitmap(in, num, bit_width, interleaved, out);
 227
 228         return out;
 229 }
 230
 231 // See decode_pfor_vb() for the format.
 232 template<class Docid>
 233 unsigned char *encode_pfor_vb(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
 234 {
 235         unsigned num_exceptions = 0;
 236         for (unsigned i = 0; i < num; ++i) {
 237                 if ((in[i] >> bit_width) != 0) {
 238                         ++num_exceptions;
 239                 }
 240         }
 241         *out++ = num_exceptions;
 242
 243         // Base values.
 244         out = encode_bitmap(in, num, bit_width, interleaved, out);
 245
 246         // Exceptions.
 247         for (unsigned i = 0; i < num; ++i) {
 248                 unsigned val = in[i] >> bit_width;
 249                 if (val != 0) {
 250                         out = write_vb(val, out);
 251                 }
 252         }
 253
 254         // Exception indexes.
 255         for (unsigned i = 0; i < num; ++i) {
 256                 unsigned val = in[i] >> bit_width;
 257                 if (val != 0) {
 258                         *out++ = i;
 259                 }
 260         }
 261
 262         return out;
 263 }
 264
 265 // Find out which block type would be the smallest for the given data.
 266 template<class Docid>
 267 BlockType decide_block_type(const Docid *in, unsigned num, unsigned *bit_width, unsigned *exception_bit_width)
 268 {
 269         // Check if the block is constant.
 270         bool constant = true;
 271         for (unsigned i = 1; i < num; ++i) {
 272                 if (in[i] != in[0]) {
 273                         constant = false;
 274                         break;
 275                 }
 276         }
 277         if (constant) {
 278                 *bit_width = num_bits(in[0]);
 279                 return BlockType::CONSTANT;
 280         }
 281
 282         // Build up a histogram of bit sizes.
 283         unsigned histogram[sizeof(Docid) * CHAR_BIT + 1] = { 0 };
 284         unsigned max_bits = 0;
 285         for (unsigned i = 0; i < num; ++i) {
 286                 unsigned bits = num_bits(in[i]);
 287                 ++histogram[bits];
 288                 max_bits = std::max(max_bits, bits);
 289         }
 290
 291         // Straight-up FOR.
 292         unsigned best_cost = bytes_for_packed_bits(num, max_bits);
 293         unsigned best_bit_width = max_bits;
 294
 295         // Try PFOR with bitmap exceptions.
 296         const unsigned bitmap_cost = bytes_for_packed_bits(num, 1);
 297         unsigned num_exceptions = 0;
 298         for (unsigned exception_bit_width = 1; exception_bit_width <= max_bits; ++exception_bit_width) {
 299                 unsigned test_bit_width = max_bits - exception_bit_width;
 300                 num_exceptions += histogram[test_bit_width + 1];
 301
 302                 // 1 byte for signaling exception bit width, then the bitmap,
 303                 // then the base values, then the exceptions.
 304                 unsigned cost = 1 + bitmap_cost + bytes_for_packed_bits(num, test_bit_width) +
 305                         bytes_for_packed_bits(num_exceptions, exception_bit_width);
 306                 if (cost < best_cost) {
 307                         best_cost = cost;
 308                         best_bit_width = test_bit_width;
 309                 }
 310         }
 311
 312         // Make the histogram cumulative; histogram[n] now means the the number of
 313         // elements with n bits or more.
 314         for (unsigned i = max_bits; i > 0; --i) {
 315                 histogram[i - 1] += histogram[i];
 316         }
 317
 318         // Try PFOR with varbyte exceptions.
 319         bool best_is_varbyte = false;
 320         for (unsigned test_bit_width = 0; test_bit_width < max_bits; ++test_bit_width) {
 321                 // 1 byte for signaling number of exceptions, plus the base values,
 322                 // and then we estimate up the varbytes and indexes using histogram
 323                 // indexes. This isn't exact, but it only helps ~0.1% on the total
 324                 // plocate.db size.
 325                 unsigned cost = 1 + bytes_for_packed_bits(num, test_bit_width);
 326                 if (cost >= best_cost) {
 327                         break;
 328                 }
 329                 if (test_bit_width + 1 <= max_bits) {
 330                         cost += 2 * histogram[test_bit_width + 1];  // > 0.
 331                         if (test_bit_width + 7 <= max_bits) {
 332                                 cost += histogram[test_bit_width + 7];  // > 176, very roughly.
 333                                 if (test_bit_width + 14 <= max_bits) {
 334                                         cost += histogram[test_bit_width + 14];  // > 16560, very roughly.
 335                                         if (test_bit_width + 19 <= max_bits) {
 336                                                 cost += histogram[test_bit_width + 19];  // > 540848, very roughly.
 337                                                 if (test_bit_width + 24 <= max_bits) {
 338                                                         cost += histogram[test_bit_width + 24];  // > 16777215, very roughly.
 339                                                 }
 340                                         }
 341                                 }
 342                         }
 343                 }
 344                 if (cost < best_cost) {
 345                         best_cost = cost;
 346                         best_bit_width = test_bit_width;
 347                         best_is_varbyte = true;
 348                 }
 349         }
 350
 351         // TODO: Consider the last-resort option of just raw storage (255).
 352
 353         if (best_is_varbyte) {
 354                 *bit_width = best_bit_width;
 355                 return BlockType::PFOR_VB;
 356         } else if (best_bit_width == max_bits) {
 357                 *bit_width = max_bits;
 358                 return BlockType::FOR;
 359         } else {
 360                 *bit_width = best_bit_width;
 361                 *exception_bit_width = max_bits - best_bit_width;
 362                 return BlockType::PFOR_BITMAP;
 363         }
 364 }
 365
 366 // The basic entry point. Takes one block of integers (which already must
 367 // be delta-minus-1-encoded) and packs it into TurboPFor format.
 368 // interleaved corresponds to the interleaved parameter in decode_pfor_delta1()
 369 // or the “128v” infix in the reference code's function names; such formats
 370 // are much faster to decode, so for full blocks, you probably want it.
 371 // The interleaved flag isn't stored anywhere; it's implicit whether you
 372 // want to use it for full blocks or not.
 373 //
 374 // The first value must already be written using write_baseval() (so the delta
 375 // coding starts from the second value). Returns the end of the string.
 376 // May write 4 bytes past the end.
 377 template<unsigned BlockSize, class Docid>
 378 unsigned char *encode_pfor_single_block(const Docid *in, unsigned num, bool interleaved, unsigned char *out)
 379 {
 380         assert(num > 0);
 381         if (interleaved) {
 382                 assert(num == BlockSize);
 383         }
 384
 385         unsigned bit_width, exception_bit_width;
 386         BlockType block_type = decide_block_type(in, num, &bit_width, &exception_bit_width);
 387         *out++ = (block_type << 6) | bit_width;
 388
 389         switch (block_type) {
 390         case BlockType::CONSTANT: {
 391                 unsigned bit_width = num_bits(in[0]);
 392                 write_le<Docid>(in[0], out);
 393                 return out + div_round_up(bit_width, 8);
 394         }
 395         case BlockType::FOR:
 396                 return encode_for(in, num, bit_width, interleaved, out);
 397         case BlockType::PFOR_BITMAP:
 398                 return encode_pfor_bitmap(in, num, bit_width, exception_bit_width, interleaved, out);
 399         case BlockType::PFOR_VB:
 400                 return encode_pfor_vb(in, num, bit_width, interleaved, out);
 401         default:
 402                 assert(false);
 403         }
 404 }
 405
 406 #endif  // !defined(_TURBOPFOR_ENCODE_H)