1 #ifndef _TURBOPFOR_ENCODE_H
2 #define _TURBOPFOR_ENCODE_H
4 // Much like turbopfor.h (and shares all of the same caveats), except this is
5 // for encoding. It is _much_ slower than the reference implementation, but we
6 // encode only during build, and most time in build is spent in other things
7 // than encoding posting lists, so it only costs ~5-10% overall. Does not use
8 // any special character sets, and generally isn't optimized at all.
10 // It encodes about 0.01% denser than the reference encoder (averaged over
11 // a real plocate corpus), probably since it has a slower but more precise
12 // method for estimating the cost of a PFOR + varbyte block.
14 #include "turbopfor-common.h"
26 void write_le(Docid val, void *out)
28 if constexpr (sizeof(Docid) == 8) {
30 } else if constexpr (sizeof(Docid) == 4) {
32 } else if constexpr (sizeof(Docid) == 2) {
34 } else if constexpr (sizeof(Docid) == 1) {
39 memcpy(out, &val, sizeof(val));
42 // Corresponds to read_baseval.
44 unsigned char *write_baseval(Docid in, unsigned char *out)
49 } else if (in < 0x4000) {
50 out[0] = (in >> 8) | 0x80;
53 } else if (in < 0x200000) {
54 out[0] = (in >> 16) | 0xc0;
56 out[2] = (in >> 8) & 0xff;
58 } else if (in < 0x10000000) {
59 out[0] = (in >> 24) | 0xe0;
60 out[1] = (in >> 16) & 0xff;
61 out[2] = (in >> 8) & 0xff;
65 assert(false); // Not implemented.
69 // Writes a varbyte-encoded exception.
71 unsigned char *write_vb(Docid val, unsigned char *out)
76 } else if (val <= 16560) {
78 *out++ = (val >> 8) + 177;
81 } else if (val <= 540848) {
83 *out = (val >> 16) + 241;
84 write_le<uint16_t>(val & 0xffff, out + 1);
86 } else if (val <= 16777215) {
88 write_le<uint32_t>(val, out + 1);
92 write_le<uint32_t>(val, out + 1);
98 inline unsigned num_bits(Docid x)
104 return sizeof(Docid) * CHAR_BIT - __builtin_clz(x);
107 for (int i = sizeof(Docid) * CHAR_BIT; i-- > 0;) {
108 if (x & (Docid{ 1 } << i)) {
118 BitWriter(unsigned char *out, unsigned bits)
119 : out(out), bits(bits) {}
120 void write(uint32_t val)
122 cur_val |= val << bits_used;
123 write_le<uint32_t>(cur_val, out);
126 cur_val >>= (bits_used / 8) * 8;
127 out += bits_used / 8;
134 unsigned bits_used = 0;
135 unsigned cur_val = 0;
138 template<unsigned NumStreams>
139 struct InterleavedBitWriter {
141 InterleavedBitWriter(unsigned char *out, unsigned bits)
142 : out(out), bits(bits) {}
143 void write(uint32_t val)
145 cur_val |= uint64_t(val) << bits_used;
146 if (bits_used + bits >= 32) {
147 write_le<uint32_t>(cur_val & 0xffffffff, out);
150 bits_used -= 32; // Underflow, but will be fixed below.
152 write_le<uint32_t>(cur_val, out);
157 static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
160 unsigned bits_used = 0;
161 uint64_t cur_val = 0;
164 // Bitpacks a set of values (making sure the top bits are lopped off).
165 // If interleaved is set, makes SSE2-compatible interleaving (this is
166 // only allowed for full blocks).
167 template<class Docid>
168 unsigned char *encode_bitmap(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
170 unsigned mask = mask_for_bits(bit_width);
172 InterleavedBitWriter<4> bs0(out + 0 * sizeof(uint32_t), bit_width);
173 InterleavedBitWriter<4> bs1(out + 1 * sizeof(uint32_t), bit_width);
174 InterleavedBitWriter<4> bs2(out + 2 * sizeof(uint32_t), bit_width);
175 InterleavedBitWriter<4> bs3(out + 3 * sizeof(uint32_t), bit_width);
176 assert(num % 4 == 0);
177 for (unsigned i = 0; i < num / 4; ++i) {
178 bs0.write(in[i * 4 + 0] & mask);
179 bs1.write(in[i * 4 + 1] & mask);
180 bs2.write(in[i * 4 + 2] & mask);
181 bs3.write(in[i * 4 + 3] & mask);
184 BitWriter bs(out, bit_width);
185 for (unsigned i = 0; i < num; ++i) {
186 bs.write(in[i] & mask);
189 return out + bytes_for_packed_bits(num, bit_width);
192 // See decode_for() for the format.
193 template<class Docid>
194 unsigned char *encode_for(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
196 return encode_bitmap(in, num, bit_width, interleaved, out);
199 // See decode_pfor_bitmap() for the format.
200 template<class Docid>
201 unsigned char *encode_pfor_bitmap(const Docid *in, unsigned num, unsigned bit_width, unsigned exception_bit_width, bool interleaved, unsigned char *out)
203 *out++ = exception_bit_width;
205 // Bitmap of exceptions.
207 BitWriter bs(out, 1);
208 for (unsigned i = 0; i < num; ++i) {
209 bs.write((in[i] >> bit_width) != 0);
211 out += bytes_for_packed_bits(num, 1);
216 BitWriter bs(out, exception_bit_width);
217 unsigned num_exceptions = 0;
218 for (unsigned i = 0; i < num; ++i) {
219 if ((in[i] >> bit_width) != 0) {
220 bs.write(in[i] >> bit_width);
224 out += bytes_for_packed_bits(num_exceptions, exception_bit_width);
228 out = encode_bitmap(in, num, bit_width, interleaved, out);
233 // See decode_pfor_vb() for the format.
234 template<class Docid>
235 unsigned char *encode_pfor_vb(const Docid *in, unsigned num, unsigned bit_width, bool interleaved, unsigned char *out)
237 unsigned num_exceptions = 0;
238 for (unsigned i = 0; i < num; ++i) {
239 if ((in[i] >> bit_width) != 0) {
243 *out++ = num_exceptions;
246 out = encode_bitmap(in, num, bit_width, interleaved, out);
249 for (unsigned i = 0; i < num; ++i) {
250 unsigned val = in[i] >> bit_width;
252 out = write_vb(val, out);
256 // Exception indexes.
257 for (unsigned i = 0; i < num; ++i) {
258 unsigned val = in[i] >> bit_width;
267 // Find out which block type would be the smallest for the given data.
268 template<class Docid>
269 BlockType decide_block_type(const Docid *in, unsigned num, unsigned *bit_width, unsigned *exception_bit_width)
271 // Check if the block is constant.
272 bool constant = true;
273 for (unsigned i = 1; i < num; ++i) {
274 if (in[i] != in[0]) {
280 *bit_width = num_bits(in[0]);
281 return BlockType::CONSTANT;
284 // Build up a histogram of bit sizes.
285 unsigned histogram[sizeof(Docid) * CHAR_BIT + 1] = { 0 };
286 unsigned max_bits = 0;
287 for (unsigned i = 0; i < num; ++i) {
288 unsigned bits = num_bits(in[i]);
290 max_bits = std::max(max_bits, bits);
294 unsigned best_cost = bytes_for_packed_bits(num, max_bits);
295 unsigned best_bit_width = max_bits;
297 // Try PFOR with bitmap exceptions.
298 const unsigned bitmap_cost = bytes_for_packed_bits(num, 1);
299 unsigned num_exceptions = 0;
300 for (unsigned exception_bit_width = 1; exception_bit_width <= max_bits; ++exception_bit_width) {
301 unsigned test_bit_width = max_bits - exception_bit_width;
302 num_exceptions += histogram[test_bit_width + 1];
304 // 1 byte for signaling exception bit width, then the bitmap,
305 // then the base values, then the exceptions.
306 unsigned cost = 1 + bitmap_cost + bytes_for_packed_bits(num, test_bit_width) +
307 bytes_for_packed_bits(num_exceptions, exception_bit_width);
308 if (cost < best_cost) {
310 best_bit_width = test_bit_width;
314 // Try PFOR with varbyte exceptions.
315 bool best_is_varbyte = false;
316 for (unsigned test_bit_width = 0; test_bit_width < max_bits; ++test_bit_width) {
317 // 1 byte for signaling number of exceptions, plus the base values,
318 // and then we count up the varbytes and indexes. (This is precise
320 unsigned cost = 1 + bytes_for_packed_bits(num, test_bit_width);
321 for (unsigned i = 0; i < num && cost < best_cost; ++i) {
322 unsigned val = in[i] >> test_bit_width;
324 // Not stored, and then also no index.
325 } else if (val <= 176) {
327 } else if (val <= 16560) {
329 } else if (val <= 540848) {
331 } else if (val <= 16777215) {
337 if (cost < best_cost) {
339 best_bit_width = test_bit_width;
340 best_is_varbyte = true;
344 // TODO: Consider the last-resort option of just raw storage (255).
346 if (best_is_varbyte) {
347 *bit_width = best_bit_width;
348 return BlockType::PFOR_VB;
349 } else if (best_bit_width == max_bits) {
350 *bit_width = max_bits;
351 return BlockType::FOR;
353 *bit_width = best_bit_width;
354 *exception_bit_width = max_bits - best_bit_width;
355 return BlockType::PFOR_BITMAP;
359 // The basic entry point. Takes one block of integers (which already must
360 // be delta-minus-1-encoded) and packs it into TurboPFor format.
361 // interleaved corresponds to the interleaved parameter in decode_pfor_delta1()
362 // or the ā128vā infix in the reference code's function names; such formats
363 // are much faster to decode, so for full blocks, you probably want it.
364 // The interleaved flag isn't stored anywhere; it's implicit whether you
365 // want to use it for full blocks or not.
367 // The first value must already be written using write_baseval() (so the delta
368 // coding starts from the second value). Returns the end of the string.
369 // May write 4 bytes past the end.
370 template<unsigned BlockSize, class Docid>
371 unsigned char *encode_pfor_single_block(const Docid *in, unsigned num, bool interleaved, unsigned char *out)
375 assert(num == BlockSize);
378 unsigned bit_width, exception_bit_width;
379 BlockType block_type = decide_block_type(in, num, &bit_width, &exception_bit_width);
380 *out++ = (block_type << 6) | bit_width;
382 switch (block_type) {
383 case BlockType::CONSTANT: {
384 unsigned bit_width = num_bits(in[0]);
385 write_le<Docid>(in[0], out);
386 return out + div_round_up(bit_width, 8);
389 return encode_for(in, num, bit_width, interleaved, out);
390 case BlockType::PFOR_BITMAP:
391 return encode_pfor_bitmap(in, num, bit_width, exception_bit_width, interleaved, out);
392 case BlockType::PFOR_VB:
393 return encode_pfor_vb(in, num, bit_width, interleaved, out);
399 #endif // !defined(_TURBOPFOR_ENCODE_H)