git.sesse.net Git - plocate/blob - turbopfor-encode.h

   1 #ifndef _TURBOPFOR_ENCODE_H
   2 #define _TURBOPFOR_ENCODE_H
   3
   4 // Much like turbopfor.h (and shares all of the same caveats), except this is
   5 // for encoding. It is _much_ slower than the reference implementation, but we
   6 // encode only during build, and most time in build is spent in other things
   7 // than encoding posting lists, so it only costs ~5-10% overall. Does not use
   8 // any special character sets, and generally isn't optimized at all.
   9 //
  10 // It encodes about 0.01% denser than the reference encoder (averaged over
  11 // a real plocate corpus), probably since it has a slower but more precise
  12 // method for estimating the cost of a PFOR + varbyte block.
  13
  14 #include "turbopfor-common.h"
  15
  16 #include <algorithm>
  17 #include <assert.h>
  18 #include <limits.h>
  19 #include <stdint.h>
  20 #include <string.h>
  21
  22 template<class Docid>
  23 void write_le(Docid val, void *out)
  24 {
  25         if constexpr (sizeof(Docid) == 8) {
  26                 val = htole64(val);
  27         } else if constexpr (sizeof(Docid) == 4) {
  28                 val = htole32(val);
  29         } else if constexpr (sizeof(Docid) == 2) {
  30                 val = htole16(val);
  31         } else if constexpr (sizeof(Docid) == 1) {
  32                 // No change.
  33         } else {
  34                 assert(false);
  35         }
  36         memcpy(out, &val, sizeof(val));
  37 }
  38
  39 // Corresponds to read_baseval.
  40 template<class Docid>
  41 unsigned char *write_baseval(Docid in, unsigned char *out)
  42 {
  43         if (in < 128) {
  44                 *out = in;
  45                 return out + 1;
  46         } else if (in < 0x4000) {
  47                 out[0] = (in >> 8) | 0x80;
  48                 out[1] = in & 0xff;
  49                 return out + 2;
  50         } else if (in < 0x200000) {
  51                 out[0] = (in >> 16) | 0xc0;
  52                 out[1] = in & 0xff;
  53                 out[2] = (in >> 8) & 0xff;
  54                 return out + 3;
  55         } else if (in < 0x10000000) {
  56                 out[0] = (in >> 24) | 0xe0;
  57                 out[1] = (in >> 16) & 0xff;
  58                 out[2] = (in >> 8) & 0xff;
  59                 out[3] = in & 0xff;
  60                 return out + 4;
  61         } else {
  62                 assert(false);  // Not implemented.
  63         }
  64 }
  65
  66 // Writes a varbyte-encoded exception.
  67 template<class Docid>
  68 unsigned char *write_vb(Docid val, unsigned char *out)
  69 {
  70         if (val <= 176) {
  71                 *out++ = val;
  72                 return out;
  73         } else if (val <= 16560) {
  74                 val -= 177;
  75                 *out++ = (val >> 8) + 177;
  76                 *out++ = val & 0xff;
  77                 return out;
  78         } else if (val <= 540848) {
  79                 val -= 16561;
  80                 *out = (val >> 16) + 241;
  81                 write_le<uint16_t>(val & 0xffff, out + 1);
  82                 return out + 3;
  83         } else if (val <= 16777215) {
  84                 *out = 249;
  85                 write_le<uint32_t>(val, out + 1);
  86                 return out + 4;
  87         } else {
  88                 *out = 250;
  89                 write_le<uint32_t>(val, out + 1);
  90                 return out + 5;
  91         }
  92 }
  93
  94 template<class Docid>
  95 inline unsigned num_bits(Docid x)
  96 {
  97 #ifdef __GNUC__
  98         if (x == 0) {
  99                 return 0;
 100         } else {
 101                 return sizeof(Docid) * CHAR_BIT - __builtin_clz(x);
 102         }
 103 #else
 104         for (int i = sizeof(Docid) * CHAR_BIT; i-- > 0; ) {
 105                 if (x & (Docid{1} << i)) {
 106                         return i;
 107                 }
 108         }
 109         return 0;
 110 #endif
 111 }
 112
 113 struct BitWriter {
 114 public:
 115         BitWriter(unsigned char *out, unsigned bits)
 116                 : out(out), bits(bits) {}
 117         void write(uint32_t val)
 118         {
 119                 cur_val |= val << bits_used;
 120                 write_le<uint32_t>(cur_val, out);
 121
 122                 bits_used += bits;
 123                 cur_val >>= (bits_used / 8) * 8;
 124                 out += bits_used / 8;
 125                 bits_used %= 8;
 126         }
 127
 128 private:
 129         unsigned char *out;
 130         const unsigned bits;
 131         unsigned bits_used = 0;
 132         unsigned cur_val = 0;
 133 };
 134
 135 template<unsigned NumStreams>
 136 struct InterleavedBitWriter {
 137 public:
 138         InterleavedBitWriter(unsigned char *out, unsigned bits)
 139                 : out(out), bits(bits) {}
 140         void write(uint32_t val)
 141         {
 142                 cur_val |= uint64_t(val) << bits_used;
 143                 if (bits_used + bits >= 32) {
 144                         write_le<uint32_t>(cur_val & 0xffffffff, out);
 145                         out += Stride;
 146                         cur_val >>= 32;
 147                         bits_used -= 32;  // Underflow, but will be fixed below.
 148                 }
 149                 write_le<uint32_t>(cur_val, out);
 150                 bits_used += bits;
 151         }
 152
 153 private:
 154         static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
 155         unsigned char *out;
 156         const unsigned bits;
 157         unsigned bits_used = 0;
 158         uint64_t cur_val = 0;
 159 };
 160
 161 // Bitpacks a set of values (making sure the top bits are lopped off).
 162 // If interleaved is set, makes SSE2-compatible interleaving (this is
 163 // only allowed for full blocks).
 164 template<class Docid>
 165 unsigned char *encode_bitmap(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
 166 {
 167         unsigned mask = mask_for_bits(bit_width);
 168         if (interleaved) {
 169                 InterleavedBitWriter<4> bs0(out + 0 * sizeof(uint32_t), bit_width);
 170                 InterleavedBitWriter<4> bs1(out + 1 * sizeof(uint32_t), bit_width);
 171                 InterleavedBitWriter<4> bs2(out + 2 * sizeof(uint32_t), bit_width);
 172                 InterleavedBitWriter<4> bs3(out + 3 * sizeof(uint32_t), bit_width);
 173                 assert(num % 4 == 0);
 174                 for (unsigned i = 0; i < num / 4; ++i) {
 175                         bs0.write(in[i * 4 + 0] & mask);
 176                         bs1.write(in[i * 4 + 1] & mask);
 177                         bs2.write(in[i * 4 + 2] & mask);
 178                         bs3.write(in[i * 4 + 3] & mask);
 179                 }
 180         } else {
 181                 BitWriter bs(out, bit_width);
 182                 for (unsigned i = 0; i < num; ++i) {
 183                         bs.write(in[i] & mask);
 184                 }
 185         }
 186         return out + bytes_for_packed_bits(num, bit_width);
 187 }
 188
 189 // See decode_for() for the format.
 190 template<class Docid>
 191 unsigned char *encode_for(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
 192 {
 193         return encode_bitmap(in, num, bit_width, interleaved, out);
 194 }
 195
 196 // See decode_pfor_bitmap() for the format.
 197 template<class Docid>
 198 unsigned char *encode_pfor_bitmap(const Docid *in, unsigned num, unsigned bit_width, unsigned exception_bit_width, bool interleaved, unsigned char *out)
 199 {
 200         *out++ = exception_bit_width;
 201
 202         // Bitmap of exceptions.
 203         {
 204                 BitWriter bs(out, 1);
 205                 for (unsigned i = 0; i < num; ++i) {
 206                         bs.write((in[i] >> bit_width) != 0);
 207                 }
 208                 out += bytes_for_packed_bits(num, 1);
 209         }
 210
 211         // Exceptions.
 212         {
 213                 BitWriter bs(out, exception_bit_width);
 214                 unsigned num_exceptions = 0;
 215                 for (unsigned i = 0; i < num; ++i) {
 216                         if ((in[i] >> bit_width) != 0) {
 217                                 bs.write(in[i] >> bit_width);
 218                                 ++num_exceptions;
 219                         }
 220                 }
 221                 out += bytes_for_packed_bits(num_exceptions, exception_bit_width);
 222         }
 223
 224         // Base values.
 225         out = encode_bitmap(in, num, bit_width, interleaved, out);
 226
 227         return out;
 228 }
 229
 230 // See decode_pfor_vb() for the format.
 231 template<class Docid>
 232 unsigned char *encode_pfor_vb(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
 233 {
 234         unsigned num_exceptions = 0;
 235         for (unsigned i = 0; i < num; ++i) {
 236                 if ((in[i] >> bit_width) != 0) {
 237                         ++num_exceptions;
 238                 }
 239         }
 240         *out++ = num_exceptions;
 241
 242         // Base values.
 243         out = encode_bitmap(in, num, bit_width, interleaved, out);
 244
 245         // Exceptions.
 246         for (unsigned i = 0; i < num; ++i) {
 247                 unsigned val = in[i] >> bit_width;
 248                 if (val != 0) {
 249                         out = write_vb(val, out);
 250                 }
 251         }
 252
 253         // Exception indexes.
 254         for (unsigned i = 0; i < num; ++i) {
 255                 unsigned val = in[i] >> bit_width;
 256                 if (val != 0) {
 257                         *out++ = i;
 258                 }
 259         }
 260
 261         return out;
 262 }
 263
 264 // Find out which block type would be the smallest for the given data.
 265 template<class Docid>
 266 BlockType decide_block_type(const Docid *in, unsigned num, unsigned *bit_width, unsigned *exception_bit_width)
 267 {
 268         // Check if the block is constant.
 269         bool constant = true;
 270         for (unsigned i = 1; i < num; ++i) {
 271                 if (in[i] != in[0]) {
 272                         constant = false;
 273                         break;
 274                 }
 275         }
 276         if (constant) {
 277                 *bit_width = num_bits(in[0]);
 278                 return BlockType::CONSTANT;
 279         }
 280
 281         // Build up a histogram of bit sizes.
 282         unsigned histogram[sizeof(Docid) * CHAR_BIT + 1] = { 0 };
 283         unsigned max_bits = 0;
 284         for (unsigned i = 0; i < num; ++i) {
 285                 unsigned bits = num_bits(in[i]);
 286                 ++histogram[bits];
 287                 max_bits = std::max(max_bits, bits);
 288         }
 289
 290         // Straight-up FOR.
 291         unsigned best_cost = bytes_for_packed_bits(num, max_bits);
 292         unsigned best_bit_width = max_bits;
 293
 294         // Try PFOR with bitmap exceptions.
 295         const unsigned bitmap_cost = bytes_for_packed_bits(num, 1);
 296         unsigned num_exceptions = 0;
 297         for (unsigned exception_bit_width = 1; exception_bit_width <= max_bits; ++exception_bit_width) {
 298                 unsigned test_bit_width = max_bits - exception_bit_width;
 299                 num_exceptions += histogram[test_bit_width + 1];
 300
 301                 // 1 byte for signaling exception bit width, then the bitmap,
 302                 // then the base values, then the exceptions.
 303                 unsigned cost = 1 + bitmap_cost + bytes_for_packed_bits(num, test_bit_width) +
 304                         bytes_for_packed_bits(num_exceptions, exception_bit_width);
 305                 if (cost < best_cost) {
 306                         best_cost = cost;
 307                         best_bit_width = test_bit_width;
 308                 }
 309         }
 310
 311         // Try PFOR with varbyte exceptions.
 312         bool best_is_varbyte = false;
 313         for (unsigned test_bit_width = 0; test_bit_width < max_bits; ++test_bit_width) {
 314                 // 1 byte for signaling number of exceptions, plus the base values,
 315                 // and then we count up the varbytes and indexes. (This is precise
 316                 // but very slow.)
 317                 unsigned cost = 1 + bytes_for_packed_bits(num, test_bit_width);
 318                 for (unsigned i = 0; i < num && cost < best_cost; ++i) {
 319                         unsigned val = in[i] >> test_bit_width;
 320                         if (val == 0) {
 321                                 // Not stored, and then also no index.
 322                         } else if (val <= 176) {
 323                                 cost += 2;
 324                         } else if (val <= 16560) {
 325                                 cost += 3;
 326                         } else if (val <= 540848) {
 327                                 cost += 4;
 328                         } else if (val <= 16777215) {
 329                                 cost += 5;
 330                         } else {
 331                                 cost += 6;
 332                         }
 333                 }
 334                 if (cost < best_cost) {
 335                         best_cost = cost;
 336                         best_bit_width = test_bit_width;
 337                         best_is_varbyte = true;
 338                 }
 339         }
 340
 341         // TODO: Consider the last-resort option of just raw storage (255).
 342
 343         if (best_is_varbyte) {
 344                 *bit_width = best_bit_width;
 345                 return BlockType::PFOR_VB;
 346         } else if (best_bit_width == max_bits) {
 347                 *bit_width = max_bits;
 348                 return BlockType::FOR;
 349         } else {
 350                 *bit_width = best_bit_width;
 351                 *exception_bit_width = max_bits - best_bit_width;
 352                 return BlockType::PFOR_BITMAP;
 353         }
 354 }
 355
 356 // The basic entry point. Takes one block of integers (which already must
 357 // be delta-minus-1-encoded) and packs it into TurboPFor format.
 358 // interleaved corresponds to the interleaved parameter in decode_pfor_delta1()
 359 // or the “128v” infix in the reference code's function names; such formats
 360 // are much faster to decode, so for full blocks, you probably want it.
 361 // The interleaved flag isn't stored anywhere; it's implicit whether you
 362 // want to use it for full blocks or not.
 363 //
 364 // The first value must already be written using write_baseval() (so the delta
 365 // coding starts from the second value). Returns the end of the string.
 366 // May write 4 bytes past the end.
 367 template<unsigned BlockSize, class Docid>
 368 unsigned char *encode_pfor_single_block(const Docid *in, unsigned num, bool interleaved, unsigned char *out)
 369 {
 370         assert(num > 0);
 371         if (interleaved) {
 372                 assert(num == BlockSize);
 373         }
 374
 375         unsigned bit_width, exception_bit_width;
 376         BlockType block_type = decide_block_type(in, num, &bit_width, &exception_bit_width);
 377         *out++ = (block_type << 6) | bit_width;
 378
 379         switch (block_type) {
 380         case BlockType::CONSTANT: {
 381                 unsigned bit_width = num_bits(in[0]);
 382                 write_le<Docid>(in[0], out);
 383                 return out + div_round_up(bit_width, 8);
 384         }
 385         case BlockType::FOR:
 386                 return encode_for(in, num, bit_width, interleaved, out);
 387         case BlockType::PFOR_BITMAP:
 388                 return encode_pfor_bitmap(in, num, bit_width, exception_bit_width, interleaved, out);
 389         case BlockType::PFOR_VB:
 390                 return encode_pfor_vb(in, num, bit_width, interleaved, out);
 391         default:
 392                 assert(false);
 393         }
 394 }
 395
 396 #endif  // !defined(_TURBOPFOR_ENCODE_H)