X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=turbopfor-encode.h;h=66396fd93c5f44b0828279a1eb4d45a4cfe3cb5b;hb=fd6198891d6fd9642effc0843fef6f23b991af3e;hp=532aa19d07b01faf6643f54153ac019387907d38;hpb=0f7ca618fb8a2e501fe68e1760de9ee716e37c40;p=plocate diff --git a/turbopfor-encode.h b/turbopfor-encode.h index 532aa19..66396fd 100644 --- a/turbopfor-encode.h +++ b/turbopfor-encode.h @@ -5,17 +5,20 @@ // for encoding. It is _much_ slower than the reference implementation, but we // encode only during build, and most time in build is spent in other things // than encoding posting lists, so it only costs ~5-10% overall. Does not use -// any special character sets, and generally isn't optimized at all. +// any special instruction sets, and generally isn't optimized at all. // -// It encodes about 0.01% denser than the reference encoder (averaged over -// a real plocate corpus), probably since it has a slower but more precise -// method for estimating the cost of a PFOR + varbyte block. +// It encodes roughly about as dense as the reference encoder. #include "turbopfor-common.h" +#include #include +#ifdef HAS_ENDIAN_H +#include +#endif #include #include +#include template void write_le(Docid val, void *out) @@ -27,7 +30,7 @@ void write_le(Docid val, void *out) } else if constexpr (sizeof(Docid) == 2) { val = htole16(val); } else if constexpr (sizeof(Docid) == 1) { - val = val; + // No change. } else { assert(false); } @@ -50,6 +53,12 @@ unsigned char *write_baseval(Docid in, unsigned char *out) out[1] = in & 0xff; out[2] = (in >> 8) & 0xff; return out + 3; + } else if (in < 0x10000000) { + out[0] = (in >> 24) | 0xe0; + out[1] = (in >> 16) & 0xff; + out[2] = (in >> 8) & 0xff; + out[3] = in & 0xff; + return out + 4; } else { assert(false); // Not implemented. } @@ -86,11 +95,20 @@ unsigned char *write_vb(Docid val, unsigned char *out) template inline unsigned num_bits(Docid x) { +#ifdef __GNUC__ if (x == 0) { return 0; } else { return sizeof(Docid) * CHAR_BIT - __builtin_clz(x); } +#else + for (int i = sizeof(Docid) * CHAR_BIT; i-- > 0;) { + if (x & (Docid{ 1 } << i)) { + return i; + } + } + return 0; +#endif } struct BitWriter { @@ -291,27 +309,36 @@ BlockType decide_block_type(const Docid *in, unsigned num, unsigned *bit_width, } } + // Make the histogram cumulative; histogram[n] now means the the number of + // elements with n bits or more. + for (unsigned i = max_bits; i > 0; --i) { + histogram[i - 1] += histogram[i]; + } + // Try PFOR with varbyte exceptions. bool best_is_varbyte = false; for (unsigned test_bit_width = 0; test_bit_width < max_bits; ++test_bit_width) { // 1 byte for signaling number of exceptions, plus the base values, - // and then we count up the varbytes and indexes. (This is precise - // but very slow.) + // and then we estimate up the varbytes and indexes using histogram + // indexes. This isn't exact, but it only helps ~0.1% on the total + // plocate.db size. unsigned cost = 1 + bytes_for_packed_bits(num, test_bit_width); - for (unsigned i = 0; i < num && cost < best_cost; ++i) { - unsigned val = in[i] >> test_bit_width; - if (val == 0) { - // Not stored, and then also no index. - } else if (val <= 176) { - cost += 2; - } else if (val <= 16560) { - cost += 3; - } else if (val <= 540848) { - cost += 4; - } else if (val <= 16777215) { - cost += 5; - } else { - cost += 6; + if (cost >= best_cost) { + break; + } + if (test_bit_width + 1 <= max_bits) { + cost += 2 * histogram[test_bit_width + 1]; // > 0. + if (test_bit_width + 7 <= max_bits) { + cost += histogram[test_bit_width + 7]; // > 176, very roughly. + if (test_bit_width + 14 <= max_bits) { + cost += histogram[test_bit_width + 14]; // > 16560, very roughly. + if (test_bit_width + 19 <= max_bits) { + cost += histogram[test_bit_width + 19]; // > 540848, very roughly. + if (test_bit_width + 24 <= max_bits) { + cost += histogram[test_bit_width + 24]; // > 16777215, very roughly. + } + } + } } } if (cost < best_cost) {