1 #ifndef _TURBOPFOR_ENCODE_H
2 #define _TURBOPFOR_ENCODE_H
4 // Much like turbopfor.h (and shares all of the same caveats), except this is
5 // for encoding. It is _much_ slower than the reference implementation, but we
6 // encode only during build, and most time in build is spent in other things
7 // than encoding posting lists, so it only costs ~5-10% overall. Does not use
8 // any special instruction sets, and generally isn't optimized at all.
10 // It encodes roughly as densely as the reference encoder.
12 #include "turbopfor-common.h"
// Stores `val` at `out` in little-endian byte order (the on-disk format).
// The width-specific if-constexpr branches presumably normalize byte order
// for each Docid size before the raw memcpy — their bodies are not visible
// in this listing, so confirm against the full file.
24 void write_le(Docid val, void *out)
26 if constexpr (sizeof(Docid) == 8) {
28 } else if constexpr (sizeof(Docid) == 4) {
30 } else if constexpr (sizeof(Docid) == 2) {
32 } else if constexpr (sizeof(Docid) == 1) {
37 memcpy(out, &val, sizeof(val));
40 // Corresponds to read_baseval.
// Writes `in` using a variable-length encoding in which the pattern of
// leading one bits in the first byte selects the total width (plain byte,
// 0x80 → 2 bytes, 0xc0 → 3 bytes, 0xe0 → 4 bytes), with the remaining
// payload in big-endian order. Values >= 0x10000000 hit the assert below
// (not implemented). Returns a pointer just past the written bytes.
// NOTE(review): several branch bodies and returns are elided in this
// listing — verify against the full file.
42 unsigned char *write_baseval(Docid in, unsigned char *out)
47 } else if (in < 0x4000) {
48 out[0] = (in >> 8) | 0x80;
51 } else if (in < 0x200000) {
52 out[0] = (in >> 16) | 0xc0;
54 out[2] = (in >> 8) & 0xff;
56 } else if (in < 0x10000000) {
57 out[0] = (in >> 24) | 0xe0;
58 out[1] = (in >> 16) & 0xff;
59 out[2] = (in >> 8) & 0xff;
63 assert(false); // Not implemented.
67 // Writes a varbyte-encoded exception.
69 unsigned char *write_vb(Docid val, unsigned char *out)
// The thresholds (16560 / 540848 / 16777215) partition values into the
// TurboPFor varbyte size classes; each branch stores a tag in the first
// byte(s) and the remaining payload via write_le. Returns a pointer just
// past the written bytes. NOTE(review): the small-value branch and several
// byte-stores/returns are elided in this listing — confirm against the
// full file.
74 } else if (val <= 16560) {
76 *out++ = (val >> 8) + 177;
79 } else if (val <= 540848) {
81 *out = (val >> 16) + 241;
82 write_le<uint16_t>(val & 0xffff, out + 1);
84 } else if (val <= 16777215) {
86 write_le<uint32_t>(val, out + 1);
90 write_le<uint32_t>(val, out + 1);
// Returns the number of significant bits in x (i.e. floor(log2(x)) + 1
// for nonzero x).
96 inline unsigned num_bits(Docid x)
// Fast path via the builtin. __builtin_clz is undefined for a zero
// argument; presumably a zero check precedes this line, but it is not
// visible in this listing — confirm.
102 return sizeof(Docid) * CHAR_BIT - __builtin_clz(x);
// Portable fallback: scan downwards from the most significant bit.
105 for (int i = sizeof(Docid) * CHAR_BIT; i-- > 0;) {
106 if (x & (Docid{ 1 } << i)) {
// Packs fixed-width (`bits`-wide) values contiguously into the byte
// stream at `out`, buffering partial words in cur_val and flushing via
// write_le. NOTE(review): the enclosing struct declaration and parts of
// write() are elided in this listing.
116 BitWriter(unsigned char *out, unsigned bits)
117 : out(out), bits(bits) {}
118 void write(uint32_t val)
120 cur_val |= val << bits_used;
121 write_le<uint32_t>(cur_val, out);
// Drop the whole bytes just flushed and advance the output pointer by
// the same amount; leftover bits stay buffered in cur_val.
124 cur_val >>= (bits_used / 8) * 8;
125 out += bits_used / 8;
// Number of bits currently buffered, and the bit buffer itself.
132 unsigned bits_used = 0;
133 unsigned cur_val = 0;
136 template<unsigned NumStreams>
137 struct InterleavedBitWriter {
// Like BitWriter, but one of NumStreams parallel streams: the output
// pointer advances in steps of Stride (NumStreams 32-bit words), yielding
// the interleaved SSE2-compatible layout. A 64-bit buffer lets a value
// straddle a 32-bit word boundary; per the comment at the original line
// 148, the deliberate underflow of bits_used is corrected later.
// NOTE(review): parts of write() are elided in this listing.
139 InterleavedBitWriter(unsigned char *out, unsigned bits)
140 : out(out), bits(bits) {}
141 void write(uint32_t val)
143 cur_val |= uint64_t(val) << bits_used;
144 if (bits_used + bits >= 32) {
145 write_le<uint32_t>(cur_val & 0xffffffff, out);
148 bits_used -= 32; // Underflow, but will be fixed below.
150 write_le<uint32_t>(cur_val, out);
155 static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
158 unsigned bits_used = 0;
159 uint64_t cur_val = 0;
162 // Bitpacks a set of values (making sure the top bits are lopped off).
163 // If interleaved is set, makes SSE2-compatible interleaving (this is
164 // only allowed for full blocks).
165 template<class Docid>
166 unsigned char *encode_bitmap(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
// mask keeps only the low bit_width bits of each value.
168 unsigned mask = mask_for_bits(bit_width);
// Interleaved path: four parallel streams, element i going to stream
// i % 4; only valid for full blocks (num % 4 == 0).
170 InterleavedBitWriter<4> bs0(out + 0 * sizeof(uint32_t), bit_width);
171 InterleavedBitWriter<4> bs1(out + 1 * sizeof(uint32_t), bit_width);
172 InterleavedBitWriter<4> bs2(out + 2 * sizeof(uint32_t), bit_width);
173 InterleavedBitWriter<4> bs3(out + 3 * sizeof(uint32_t), bit_width);
174 assert(num % 4 == 0);
175 for (unsigned i = 0; i < num / 4; ++i) {
176 bs0.write(in[i * 4 + 0] & mask);
177 bs1.write(in[i * 4 + 1] & mask);
178 bs2.write(in[i * 4 + 2] & mask);
179 bs3.write(in[i * 4 + 3] & mask);
// Non-interleaved path: one sequential bit stream.
182 BitWriter bs(out, bit_width);
183 for (unsigned i = 0; i < num; ++i) {
184 bs.write(in[i] & mask);
// Both layouts occupy the same number of bytes.
187 return out + bytes_for_packed_bits(num, bit_width);
190 // See decode_for() for the format.
// FOR (frame of reference): plain bitpacking of all values, no exceptions.
191 template<class Docid>
192 unsigned char *encode_for(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
194 return encode_bitmap(in, num, bit_width, interleaved, out);
197 // See decode_pfor_bitmap() for the format.
198 template<class Docid>
199 unsigned char *encode_pfor_bitmap(const Docid *in, unsigned num, unsigned bit_width, unsigned exception_bit_width, bool interleaved, unsigned char *out)
// Header: one byte giving the bit width of the exception values.
201 *out++ = exception_bit_width;
203 // Bitmap of exceptions.
// One bit per element: set iff the value does not fit in bit_width bits.
205 BitWriter bs(out, 1);
206 for (unsigned i = 0; i < num; ++i) {
207 bs.write((in[i] >> bit_width) != 0);
209 out += bytes_for_packed_bits(num, 1);
// The overflow (high) bits of each exceptional value, densely packed at
// exception_bit_width bits apiece.
214 BitWriter bs(out, exception_bit_width);
215 unsigned num_exceptions = 0;
216 for (unsigned i = 0; i < num; ++i) {
217 if ((in[i] >> bit_width) != 0) {
218 bs.write(in[i] >> bit_width);
222 out += bytes_for_packed_bits(num_exceptions, exception_bit_width);
// Finally the low bit_width bits of every element.
226 out = encode_bitmap(in, num, bit_width, interleaved, out);
231 // See decode_pfor_vb() for the format.
232 template<class Docid>
233 unsigned char *encode_pfor_vb(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
// Count the values that overflow bit_width bits (the exceptions).
235 unsigned num_exceptions = 0;
236 for (unsigned i = 0; i < num; ++i) {
237 if ((in[i] >> bit_width) != 0) {
// Header: one byte with the exception count.
241 *out++ = num_exceptions;
// Base values: the low bit_width bits of every element.
244 out = encode_bitmap(in, num, bit_width, interleaved, out);
// Exception values (high bits only), varbyte-encoded.
247 for (unsigned i = 0; i < num; ++i) {
248 unsigned val = in[i] >> bit_width;
250 out = write_vb(val, out);
254 // Exception indexes.
// Positions of the exceptional elements. NOTE(review): the statement that
// actually writes each index is elided in this listing — confirm against
// the full file.
255 for (unsigned i = 0; i < num; ++i) {
256 unsigned val = in[i] >> bit_width;
265 // Find out which block type would be the smallest for the given data.
// Chooses the cheapest of CONSTANT, FOR, PFOR_BITMAP and PFOR_VB for this
// block, returning the type and filling in *bit_width (and, for
// PFOR_BITMAP, *exception_bit_width). Costs are measured in bytes; the
// varbyte cost is an estimate (see comments below).
266 template<class Docid>
267 BlockType decide_block_type(const Docid *in, unsigned num, unsigned *bit_width, unsigned *exception_bit_width)
269 // Check if the block is constant.
270 bool constant = true;
271 for (unsigned i = 1; i < num; ++i) {
272 if (in[i] != in[0]) {
275 *bit_width = num_bits(in[0]);
279 return BlockType::CONSTANT;
282 // Build up a histogram of bit sizes.
283 unsigned histogram[sizeof(Docid) * CHAR_BIT + 1] = { 0 };
284 unsigned max_bits = 0;
285 for (unsigned i = 0; i < num; ++i) {
286 unsigned bits = num_bits(in[i]);
288 max_bits = std::max(max_bits, bits);
// Baseline candidate: plain FOR at max_bits per element.
292 unsigned best_cost = bytes_for_packed_bits(num, max_bits);
293 unsigned best_bit_width = max_bits;
295 // Try PFOR with bitmap exceptions.
296 const unsigned bitmap_cost = bytes_for_packed_bits(num, 1);
297 unsigned num_exceptions = 0;
298 for (unsigned exception_bit_width = 1; exception_bit_width <= max_bits; ++exception_bit_width) {
299 unsigned test_bit_width = max_bits - exception_bit_width;
300 num_exceptions += histogram[test_bit_width + 1];
302 // 1 byte for signaling exception bit width, then the bitmap,
303 // then the base values, then the exceptions.
304 unsigned cost = 1 + bitmap_cost + bytes_for_packed_bits(num, test_bit_width) +
305 bytes_for_packed_bits(num_exceptions, exception_bit_width);
306 if (cost < best_cost) {
308 best_bit_width = test_bit_width;
312 // Make the histogram cumulative; histogram[n] now means the number of
313 // elements with n bits or more.
314 for (unsigned i = max_bits; i > 0; --i) {
315 histogram[i - 1] += histogram[i];
318 // Try PFOR with varbyte exceptions.
319 bool best_is_varbyte = false;
320 for (unsigned test_bit_width = 0; test_bit_width < max_bits; ++test_bit_width) {
321 // 1 byte for signaling number of exceptions, plus the base values,
322 // and then we estimate up the varbytes and indexes using histogram
323 // indexes. This isn't exact, but it only helps ~0.1% on the total
325 unsigned cost = 1 + bytes_for_packed_bits(num, test_bit_width);
326 if (cost >= best_cost) {
// The nested thresholds mirror write_vb's size classes: each extra tier
// of overflow bits costs roughly one more byte per exception.
329 if (test_bit_width + 1 <= max_bits) {
330 cost += 2 * histogram[test_bit_width + 1]; // > 0.
331 if (test_bit_width + 7 <= max_bits) {
332 cost += histogram[test_bit_width + 7]; // > 176, very roughly.
333 if (test_bit_width + 14 <= max_bits) {
334 cost += histogram[test_bit_width + 14]; // > 16560, very roughly.
335 if (test_bit_width + 19 <= max_bits) {
336 cost += histogram[test_bit_width + 19]; // > 540848, very roughly.
337 if (test_bit_width + 24 <= max_bits) {
338 cost += histogram[test_bit_width + 24]; // > 16777215, very roughly.
344 if (cost < best_cost) {
346 best_bit_width = test_bit_width;
347 best_is_varbyte = true;
351 // TODO: Consider the last-resort option of just raw storage (255).
353 if (best_is_varbyte) {
354 *bit_width = best_bit_width;
355 return BlockType::PFOR_VB;
356 } else if (best_bit_width == max_bits) {
357 *bit_width = max_bits;
358 return BlockType::FOR;
360 *bit_width = best_bit_width;
361 *exception_bit_width = max_bits - best_bit_width;
362 return BlockType::PFOR_BITMAP;
366 // The basic entry point. Takes one block of integers (which already must
367 // be delta-minus-1-encoded) and packs it into TurboPFor format.
368 // interleaved corresponds to the interleaved parameter in decode_pfor_delta1()
369 // or the "128v" infix in the reference code's function names; such formats
370 // are much faster to decode, so for full blocks, you probably want it.
371 // The interleaved flag isn't stored anywhere; it's implicit whether you
372 // want to use it for full blocks or not.
374 // The first value must already be written using write_baseval() (so the delta
375 // coding starts from the second value). Returns the end of the string.
376 // May write 4 bytes past the end.
377 template<unsigned BlockSize, class Docid>
378 unsigned char *encode_pfor_single_block(const Docid *in, unsigned num, bool interleaved, unsigned char *out)
// Interleaved encoding is only meaningful for full blocks.
382 assert(num == BlockSize);
// Pick the cheapest representation, then emit the one-byte block header:
// block type in the top two bits, bit width in the low six.
385 unsigned bit_width, exception_bit_width;
386 BlockType block_type = decide_block_type(in, num, &bit_width, &exception_bit_width);
387 *out++ = (block_type << 6) | bit_width;
389 switch (block_type) {
390 case BlockType::CONSTANT: {
// Constant block: the single value is stored once, using only as many
// whole bytes as its bit width requires.
391 unsigned bit_width = num_bits(in[0]);
392 write_le<Docid>(in[0], out);
393 return out + div_round_up(bit_width, 8);
396 return encode_for(in, num, bit_width, interleaved, out);
397 case BlockType::PFOR_BITMAP:
398 return encode_pfor_bitmap(in, num, bit_width, exception_bit_width, interleaved, out);
399 case BlockType::PFOR_VB:
400 return encode_pfor_vb(in, num, bit_width, interleaved, out);
406 #endif // !defined(_TURBOPFOR_ENCODE_H)