From e11ae4e2eabde08a2bdf5ede631ae243822b6c7c Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sun, 4 Oct 2020 23:49:52 +0200 Subject: [PATCH] Start reimplementing the TurboPFor decoding functions. --- Makefile | 5 + bench.cpp | 88 ++++++++++++++++ plocate.cpp | 7 +- turbopfor.h | 296 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 394 insertions(+), 2 deletions(-) create mode 100644 bench.cpp create mode 100644 turbopfor.h diff --git a/Makefile b/Makefile index c51c41f..9fb360f 100644 --- a/Makefile +++ b/Makefile @@ -30,4 +30,9 @@ install: all $(INSTALL) -m 0755 plocate-build $(PREFIX)/sbin/ $(INSTALL) -m 0755 update-plocate.sh /etc/cron.daily/plocate +bench.o: bench.cpp turbopfor.h + +bench: bench.o io_uring_engine.o TurboPFor-Integer-Compression/libic.a + $(CXX) -o $@ $^ $(URING_LIBS) + .PHONY: clean install diff --git a/bench.cpp b/bench.cpp new file mode 100644 index 0000000..46b6ed0 --- /dev/null +++ b/bench.cpp @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include + +#define dprintf(...) +//#define dprintf(...) fprintf(stderr, __VA_ARGS__); + +#include "turbopfor.h" +#include "vp4.h" +#include "db.h" +#include "io_uring_engine.h" + +using namespace std; +using namespace std::chrono; + +int main(void) +{ + int fd = open("plocate.db", O_RDONLY); + if (fd == -1) { + perror("plocate.db"); + exit(1); + } + + Header hdr; + complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0); + + unique_ptr ht(new Trigram[hdr.hashtable_size + hdr.extra_ht_slots + 1]); + complete_pread(fd, ht.get(), (hdr.hashtable_size + hdr.extra_ht_slots + 1) * sizeof(Trigram), hdr.hash_table_offset_bytes); + + uint32_t longest_pl = 0; + vector> posting_lists; + for (unsigned i = 0; i < hdr.hashtable_size + hdr.extra_ht_slots; ++i) { + if (ht[i].num_docids == 0) { + continue; + } + size_t len = ht[i + 1].offset - ht[i].offset; + string str; + str.resize(len); + complete_pread(fd, &str[0], len, ht[i].offset); + posting_lists.emplace_back(move(str), ht[i].num_docids); + longest_pl = std::max(ht[i].num_docids, longest_pl); + } + ht.reset(); + fprintf(stderr, "Read %zu posting lists.\n", posting_lists.size()); + + size_t num_errors = 0; + for (auto &[pl, num_docids] : posting_lists) { + //fprintf(stderr, "%zu bytes, %u docids\n", pl.size(), num_docids); + vector out1, out2; + out1.resize(num_docids + 128); + out2.resize(num_docids + 128); + unsigned char *pldata = reinterpret_cast(&pl[0]); + p4nd1dec32(pldata, num_docids, &out1[0]); + decode_pfor_delta1<128>(pldata, num_docids, &out2[0]); + for (unsigned i = 0; i < num_docids; ++i) { + if (out1[i] != out2[i]) { + for (unsigned j = 0; j < num_docids; ++j) { + fprintf(stderr, "%3u: reference=%u ours=%u (diff=%d)\n", j, out1[j], out2[j], out1[j] - out2[j]); + } + ++num_errors; + break; + } + } + } + fprintf(stderr, "%zu/%zu posting lists had errors in decoding.\n", num_errors, posting_lists.size()); + + vector dummy; + dummy.resize(longest_pl + 128); + steady_clock::time_point start = steady_clock::now(); + for (auto &[pl, num_docids] : posting_lists) { + unsigned char *pldata = reinterpret_cast(&pl[0]); + p4nd1dec32(pldata, num_docids, &dummy[0]); + } + steady_clock::time_point end = steady_clock::now(); + double reference_sec = duration(end - start).count(); + fprintf(stderr, "Decoding with reference implementation: %.1f ms\n", 1e3 * reference_sec); + + start = steady_clock::now(); + for (auto &[pl, num_docids] : posting_lists) { + unsigned char *pldata = reinterpret_cast(&pl[0]); + decode_pfor_delta1<128>(pldata, num_docids, &dummy[0]); + } + end = steady_clock::now(); + double own_sec = duration(end - start).count(); + fprintf(stderr, "Decoding with own implementation: %.1f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec); +} diff --git a/plocate.cpp b/plocate.cpp index dc64b38..9b1f234 100644 --- a/plocate.cpp +++ b/plocate.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,8 @@ using namespace std::chrono; #define dprintf(...) //#define dprintf(...) fprintf(stderr, __VA_ARGS__); +#include "turbopfor.h" + const char *dbpath = "/var/lib/mlocate/plocate.db"; bool print_nul = false; @@ -388,7 +391,7 @@ void do_search_file(const vector &needles, const char *filename) unsigned char *pldata = reinterpret_cast(s.data()); if (in1.empty()) { in1.resize(num + 128); - p4nd1dec32(pldata, num, &in1[0]); + decode_pfor_delta1<128>(pldata, num, &in1[0]); in1.resize(num); dprintf("trigram '%c%c%c' (%zu bytes) decoded to %zu entries\n", trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff, len, num); @@ -396,7 +399,7 @@ void do_search_file(const vector &needles, const char *filename) if (in2.size() < num + 128) { in2.resize(num + 128); } - p4nd1dec32(pldata, num, &in2[0]); + decode_pfor_delta1<128>(pldata, num, &in2[0]); out.clear(); set_intersection(in1.begin(), in1.end(), in2.begin(), in2.begin() + num, diff --git a/turbopfor.h b/turbopfor.h new file mode 100644 index 0000000..a7913db --- /dev/null +++ b/turbopfor.h @@ -0,0 +1,296 @@ +#ifndef _TURBOPFOR_H +#define _TURBOPFOR_H 1 + +// A reimplementation of parts of the TurboPFor codecs, using the same +// storage format. These are not as fast as the reference implementation +// (about 1/3 of the performance), and do not support the same breadth of +// codecs (in particular, only delta-plus-1 is implemented, and only 32-bit +// docids are tested), but aim to be more portable and easier-to-understand. +// In particular, they will compile on x86 without SSE4.1 or AVX support. +// +// The main reference is https://michael.stapelberg.ch/posts/2019-02-05-turbopfor-analysis/, +// although some implementation details have been worked out by studying the +// TurboPFor code. + +#include +#include +#include +#include +#include + +#include + +template +Docid read_le(const void *in) +{ + Docid val; + memcpy(&val, in, sizeof(val)); + if constexpr (sizeof(Docid) == 8) { + return le64toh(val); + } else if constexpr (sizeof(Docid) == 4) { + return le32toh(val); + } else if constexpr (sizeof(Docid) == 2) { + return le16toh(val); + } else if constexpr (sizeof(Docid) == 1) { + return val; + } else { + assert(false); + } +} + +// Reads a single value with an encoding that looks a bit like PrefixVarint. +// It's unclear why this doesn't use the varbyte encoding. +template +const unsigned char *read_baseval(const unsigned char *in, Docid *out) +{ + //fprintf(stderr, "baseval: 0x%02x 0x%02x 0x%02x 0x%02x\n", in[0], in[1], in[2], in[3]); + if (*in < 128) { + *out = *in; + return in + 1; + } else if (*in < 192) { + *out = ((uint32_t(in[0]) << 8) | uint32_t(in[1])) & 0x3fff; + return in + 2; + } else if (*in < 224) { + *out = ((uint32_t(in[0]) << 16) | + (uint32_t(in[2]) << 8) | + (uint32_t(in[1]))) & 0x1fffff; + return in + 3; + } else { + assert(false); // Not implemented. + } +} + +template +const unsigned char *read_vb(const unsigned char *in, Docid *out) +{ + if (*in <= 176) { + *out = *in; + return in + 1; + } else if (*in <= 240) { + *out = ((uint32_t(in[0] - 177) << 8) | uint32_t(in[1])) + 177; + return in + 2; + } else if (*in <= 248) { + *out = ((uint32_t(in[0] - 241) << 16) | read_le(in + 1)) + 16561; + return in + 3; + } else if (*in == 249) { + *out = (uint32_t(in[1])) | + (uint32_t(in[2]) << 8) | + (uint32_t(in[3]) << 16); + return in + 4; + } else if (*in == 250) { + *out = read_le(in + 1); + return in + 5; + } else { + assert(false); + } +} + +struct BitReader { +public: + BitReader(const unsigned char *in, unsigned bits) + : in(in), bits(bits), mask((1U << bits) - 1) {} + uint32_t read() + { + uint32_t val = (read_le(in) >> bits_used) & mask; + + bits_used += bits; + in += bits_used / 8; + bits_used %= 8; + + return val; + } + +private: + const unsigned char *in; + const unsigned bits; + const unsigned mask; + unsigned bits_used = 0; +}; + +// Does not properly account for overflow. +inline unsigned div_round_up(unsigned val, unsigned div) +{ + return (val + div - 1) / div; +} + +inline unsigned bytes_for_packed_bits(unsigned num, unsigned bit_width) +{ + return div_round_up(num * bit_width, CHAR_BIT); +} + +// Constant block. Layout: +// +// - Bit width (6 bits) | type << 6 +// - Base values ( bits, rounded up to nearest byte) +template +const unsigned char *decode_constant(const unsigned char *in, unsigned num, Docid *out) +{ + const unsigned bit_width = *in++ & 0x3f; + Docid val = read_le(in); + if (bit_width < sizeof(Docid) * 8) { + val &= ((1U << bit_width) - 1); + } + + Docid *prev_out = out - 1; + for (unsigned i = 0; i < num; ++i) { + out[i] = val + prev_out[i] + 1; + } + return in + div_round_up(bit_width, 8); +} + +// FOR block (ie., PFor without exceptions). Layout: +// +// - Bit width (6 bits) | type << 6 +// - Base values ( values of bits, rounded up to a multiple of 32 values) +template +const unsigned char *decode_for(const unsigned char *in, unsigned num, Docid *out) +{ + const unsigned bit_width = *in++ & 0x3f; + + Docid *prev_out = out - 1; + BitReader bs(in, bit_width); + for (unsigned i = 0; i < num; ++i) { + out[i] = bs.read() + prev_out[i] + 1; + } + return in + bytes_for_packed_bits(num, bit_width); +} + +// PFor block with bitmap exceptions. Layout: +// +// - Bit width (6 bits) | type << 6 +// - Exception bit width (8 bits) +// - Bitmap of which values have exceptions ( bits, rounded up to a byte) +// - Exceptions ( values of bits, rounded up to a byte) +// - Base values ( values of bits, rounded up to a byte) +template +const unsigned char *decode_pfor_bitmap(const unsigned char *in, unsigned num, Docid *out) +{ + memset(out, 0, num * sizeof(Docid)); + + const unsigned bit_width = *in++ & 0x3f; + const unsigned exception_bit_width = *in++; + + // Decode exceptions. + { + const uint64_t *exception_bitmap_ptr = reinterpret_cast(in); + in += div_round_up(num, 8); + + int num_exceptions = 0; + + BitReader bs(in, exception_bit_width); + for (unsigned i = 0; i < num; i += 64, ++exception_bitmap_ptr) { + uint64_t exceptions = read_le(exception_bitmap_ptr); + if (num - i < 64) { + // We've read some bytes past the end, so clear out the junk bits. + exceptions &= (1ULL << (num - i)) - 1; + } + for (; exceptions != 0; exceptions &= exceptions - 1, ++num_exceptions) { + unsigned idx = (ffsll(exceptions) - 1) + i; + out[idx] = bs.read() << bit_width; + } + } + in += bytes_for_packed_bits(num_exceptions, exception_bit_width); + } + + // Decode the base values, and delta-decode. + Docid *prev_out = out - 1; + BitReader bs(in, bit_width); + for (unsigned i = 0; i < num; ++i) { + out[i] = (out[i] | bs.read()) + prev_out[i] + 1; + } + return in + bytes_for_packed_bits(num, bit_width); +} + +// PFor block with variable-byte exceptions. Layout: +// +// - Bit width (6 bits) | type << 6 +// - Number of exceptions (8 bits) +// - Base values ( values of bits, rounded up to a byte) +// - Exceptions: +// - If first byte is 255, 32-bit values (does not include the 255 byte) +// - Else, varbyte-encoded values (includes the non-255 byte) +// - Indexes of exceptions ( bytes). +template +const unsigned char *decode_pfor_vb(const unsigned char *in, unsigned num, Docid *out) +{ + //fprintf(stderr, "in=%p out=%p num=%u\n", in, out, num); + + const unsigned bit_width = *in++ & 0x3f; + unsigned num_exceptions = *in++; + + // Decode the base values. + BitReader bs(in, bit_width); + for (unsigned i = 0; i < num; ++i) { + out[i] = bs.read(); + } + in += bytes_for_packed_bits(num, bit_width); + + // Decode exceptions. + Docid exceptions[BlockSize]; + if (*in == 255) { + ++in; + for (unsigned i = 0; i < num_exceptions; ++i) { + exceptions[i] = read_le(in); + in += sizeof(Docid); + } + } else { + for (unsigned i = 0; i < num_exceptions; ++i) { + in = read_vb(in, &exceptions[i]); + } + } + // Apply exceptions. + for (unsigned i = 0; i < num_exceptions; ++i) { + unsigned idx = *in++; + out[idx] |= exceptions[i] << bit_width; + } + + // Delta-decode. + Docid *prev_out = out - 1; + for (unsigned i = 0; i < num; ++i) { + out[i] = out[i] + prev_out[i] + 1; + } + + return in; +} + +enum BlockType { + FOR = 0, + PFOR_VB = 1, + PFOR_BITMAP = 2, + CONSTANT = 3 +}; + +template +const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, Docid *out) +{ + if (num == 0) { + return in; + } + in = read_baseval(in, out++); + + for (unsigned i = 1; i < num; i += BlockSize, out += BlockSize) { + const unsigned num_this_block = std::min(num - i, BlockSize); + switch (in[0] >> 6) { + case BlockType::FOR: + dprintf("%d+%d: blocktype=%d (for), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_for(in, num_this_block, out); + break; + case BlockType::PFOR_VB: + dprintf("%d+%d: blocktype=%d (pfor + vb), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_pfor_vb(in, num_this_block, out); + break; + case BlockType::PFOR_BITMAP: + dprintf("%d+%d: blocktype=%d (pfor + bitmap), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_pfor_bitmap(in, num_this_block, out); + break; + case BlockType::CONSTANT: + dprintf("%d+%d: blocktype=%d (constant), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_constant(in, num_this_block, out); + break; + } + } + + return in; +} + +#endif // !defined(_TURBOPFOR_H) -- 2.39.2