From e77d4b9e6bb9e67dbf3291aec88f9c342c03a6fa Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Mon, 5 Oct 2020 21:49:05 +0200 Subject: [PATCH] Support decoding the SIMD interleaved TurboPFor formats. Our decoder is even slower for these than for the regular formats, but at least it allows us to switch the format back. We'll see about some mild optimization next. --- bench.cpp | 10 +-- plocate-build.cpp | 4 +- plocate.cpp | 4 +- turbopfor.h | 178 ++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 180 insertions(+), 16 deletions(-) diff --git a/bench.cpp b/bench.cpp index d9704ef..03b3967 100644 --- a/bench.cpp +++ b/bench.cpp @@ -52,8 +52,8 @@ int main(void) out1.resize(num_docids + 128); out2.resize(num_docids + 128); unsigned char *pldata = reinterpret_cast(&pl[0]); - p4nd1dec32(pldata, num_docids, &out1[0]); - decode_pfor_delta1<128>(pldata, num_docids, &out2[0]); + p4nd1dec128v32(pldata, num_docids, &out1[0]); + decode_pfor_delta1<128>(pldata, num_docids, /*interleaved=*/true, &out2[0]); for (unsigned i = 0; i < num_docids; ++i) { if (out1[i] != out2[i]) { if (++num_errors < 10) { @@ -72,7 +72,7 @@ int main(void) steady_clock::time_point start = steady_clock::now(); for (auto &[pl, num_docids] : posting_lists) { unsigned char *pldata = reinterpret_cast(&pl[0]); - p4nd1dec32(pldata, num_docids, &dummy[0]); + p4nd1dec128v32(pldata, num_docids, &dummy[0]); } steady_clock::time_point end = steady_clock::now(); double reference_sec = duration(end - start).count(); @@ -81,9 +81,9 @@ int main(void) start = steady_clock::now(); for (auto &[pl, num_docids] : posting_lists) { unsigned char *pldata = reinterpret_cast(&pl[0]); - decode_pfor_delta1<128>(pldata, num_docids, &dummy[0]); + decode_pfor_delta1<128>(pldata, num_docids, /*interleaved=*/true, &dummy[0]); } end = steady_clock::now(); double own_sec = duration(end - start).count(); - fprintf(stderr, "Decoding with own implementation: %.1f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec); + fprintf(stderr, "Decoding with own implementation: %.3f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec); } diff --git a/plocate-build.cpp b/plocate-build.cpp index d859430..0a19c1f 100644 --- a/plocate-build.cpp +++ b/plocate-build.cpp @@ -130,14 +130,14 @@ void PostingListBuilder::append_block() { unsigned char buf[P4NENC_BOUND(128)]; assert(pending_docids.size() == 128); - unsigned char *end = p4d1enc32(pending_docids.data(), 128, buf, last_block_end); + unsigned char *end = p4d1enc128v32(pending_docids.data(), 128, buf, last_block_end); encoded.append(reinterpret_cast(buf), reinterpret_cast(end)); } void PostingListBuilder::write_header(uint32_t docid) { unsigned char buf[P4NENC_BOUND(1)]; - size_t bytes = p4nd1enc32(&docid, 1, buf); + size_t bytes = p4nd1enc128v32(&docid, 1, buf); encoded.append(reinterpret_cast(buf), bytes); } diff --git a/plocate.cpp b/plocate.cpp index 9b1f234..2c5514e 100644 --- a/plocate.cpp +++ b/plocate.cpp @@ -391,7 +391,7 @@ void do_search_file(const vector &needles, const char *filename) unsigned char *pldata = reinterpret_cast(s.data()); if (in1.empty()) { in1.resize(num + 128); - decode_pfor_delta1<128>(pldata, num, &in1[0]); + decode_pfor_delta1<128>(pldata, num, /*interleaved=*/true, &in1[0]); in1.resize(num); dprintf("trigram '%c%c%c' (%zu bytes) decoded to %zu entries\n", trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff, len, num); @@ -399,7 +399,7 @@ void do_search_file(const vector &needles, const char *filename) if (in2.size() < num + 128) { in2.resize(num + 128); } - decode_pfor_delta1<128>(pldata, num, &in2[0]); + decode_pfor_delta1<128>(pldata, num, /*interleaved=*/true, &in2[0]); out.clear(); set_intersection(in1.begin(), in1.end(), in2.begin(), in2.begin() + num, diff --git a/turbopfor.h b/turbopfor.h index a7913db..12150f9 100644 --- a/turbopfor.h +++ b/turbopfor.h @@ -107,6 +107,35 @@ private: unsigned bits_used = 0; }; +template +struct InterleavedBitReader { +public: + InterleavedBitReader(const unsigned char *in, unsigned bits) + : in(in), bits(bits), mask((1U << bits) - 1) {} + uint32_t read() + { + uint32_t val; + if (bits_used + bits > 32) { + val = (read_le(in) >> bits_used) | (read_le(in + Stride) << (32 - bits_used)); + } else { + val = (read_le(in) >> bits_used); + } + + bits_used += bits; + in += Stride * (bits_used / 32); + bits_used %= 32; + + return val & mask; + } + +private: + static constexpr unsigned Stride = NumStreams * sizeof(uint32_t); + const unsigned char *in; + const unsigned bits; + const unsigned mask; + unsigned bits_used = 0; +}; + // Does not properly account for overflow. inline unsigned div_round_up(unsigned val, unsigned div) { @@ -155,6 +184,30 @@ const unsigned char *decode_for(const unsigned char *in, unsigned num, Docid *ou return in + bytes_for_packed_bits(num, bit_width); } +// Like decode_for(), but the values are organized in four independent streams, +// for SIMD (presumably SSE2). Supports a whole block only. +template +const unsigned char *decode_for_interleaved(const unsigned char *in, Docid *out) +{ + const unsigned bit_width = *in++ & 0x3f; + + InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width); + InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width); + InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width); + InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width); + for (unsigned i = 0; i < BlockSize / 4; ++i) { + out[i * 4 + 0] = bs0.read(); + out[i * 4 + 1] = bs1.read(); + out[i * 4 + 2] = bs2.read(); + out[i * 4 + 3] = bs3.read(); + } + Docid *prev_out = out - 1; + for (unsigned i = 0; i < BlockSize; ++i) { + out[i] += prev_out[i] + 1; + } + return in + bytes_for_packed_bits(BlockSize, bit_width); +} + // PFor block with bitmap exceptions. Layout: // // - Bit width (6 bits) | type << 6 @@ -201,6 +254,52 @@ const unsigned char *decode_pfor_bitmap(const unsigned char *in, unsigned num, D return in + bytes_for_packed_bits(num, bit_width); } +// Like decode_pfor_bitmap(), but the base values are organized in four +// independent streams, for SIMD (presumably SSE2). Supports a whole block only. +template +const unsigned char *decode_pfor_bitmap_interleaved(const unsigned char *in, Docid *out) +{ + memset(out, 0, BlockSize * sizeof(Docid)); + + const unsigned bit_width = *in++ & 0x3f; + const unsigned exception_bit_width = *in++; + + // Decode exceptions. + { + const uint64_t *exception_bitmap_ptr = reinterpret_cast(in); + in += div_round_up(BlockSize, 8); + + int num_exceptions = 0; + + BitReader bs(in, exception_bit_width); + for (unsigned i = 0; i < BlockSize; i += 64, ++exception_bitmap_ptr) { + uint64_t exceptions = read_le(exception_bitmap_ptr); + for (; exceptions != 0; exceptions &= exceptions - 1, ++num_exceptions) { + unsigned idx = (ffsll(exceptions) - 1) + i; + out[idx] = bs.read() << bit_width; + } + } + in += bytes_for_packed_bits(num_exceptions, exception_bit_width); + } + + // Decode the base values, and delta-decode. + InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width); + InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width); + InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width); + InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width); + for (unsigned i = 0; i < BlockSize / 4; ++i) { + out[i * 4 + 0] |= bs0.read(); + out[i * 4 + 1] |= bs1.read(); + out[i * 4 + 2] |= bs2.read(); + out[i * 4 + 3] |= bs3.read(); + } + Docid *prev_out = out - 1; + for (unsigned i = 0; i < BlockSize; ++i) { + out[i] += prev_out[i] + 1; + } + return in + bytes_for_packed_bits(BlockSize, bit_width); +} + // PFor block with variable-byte exceptions. Layout: // // - Bit width (6 bits) | type << 6 @@ -253,6 +352,56 @@ const unsigned char *decode_pfor_vb(const unsigned char *in, unsigned num, Docid return in; } +// Like decode_pfor_vb(), but the base values are organized in four +// independent streams, for SIMD (presumably SSE2). Supports a whole block only. +template +const unsigned char *decode_pfor_vb_interleaved(const unsigned char *in, Docid *out) +{ + const unsigned bit_width = *in++ & 0x3f; + unsigned num_exceptions = *in++; + + // Decode the base values. + InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width); + InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width); + InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width); + InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width); + for (unsigned i = 0; i < BlockSize / 4; ++i) { + out[i * 4 + 0] = bs0.read(); + out[i * 4 + 1] = bs1.read(); + out[i * 4 + 2] = bs2.read(); + out[i * 4 + 3] = bs3.read(); + } + in += bytes_for_packed_bits(BlockSize, bit_width); + + // Decode exceptions. + Docid exceptions[BlockSize]; + if (*in == 255) { + ++in; + for (unsigned i = 0; i < num_exceptions; ++i) { + exceptions[i] = read_le(in); + in += sizeof(Docid); + } + } else { + for (unsigned i = 0; i < num_exceptions; ++i) { + in = read_vb(in, &exceptions[i]); + } + } + + // Apply exceptions. + for (unsigned i = 0; i < num_exceptions; ++i) { + unsigned idx = *in++; + out[idx] |= exceptions[i] << bit_width; + } + + // Delta-decode. + Docid *prev_out = out - 1; + for (unsigned i = 0; i < BlockSize; ++i) { + out[i] = out[i] + prev_out[i] + 1; + } + + return in; +} + enum BlockType { FOR = 0, PFOR_VB = 1, @@ -261,7 +410,7 @@ enum BlockType { }; template -const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, Docid *out) +const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, bool interleaved, Docid *out) { if (num == 0) { return in; @@ -272,16 +421,31 @@ const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, D const unsigned num_this_block = std::min(num - i, BlockSize); switch (in[0] >> 6) { case BlockType::FOR: - dprintf("%d+%d: blocktype=%d (for), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); - in = decode_for(in, num_this_block, out); + if (interleaved && num_this_block == BlockSize) { + dprintf("%d+%d: blocktype=%d (for, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_for_interleaved(in, out); + } else { + dprintf("%d+%d: blocktype=%d (for), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_for(in, num_this_block, out); + } break; case BlockType::PFOR_VB: - dprintf("%d+%d: blocktype=%d (pfor + vb), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); - in = decode_pfor_vb(in, num_this_block, out); + if (interleaved && num_this_block == BlockSize) { + dprintf("%d+%d: blocktype=%d (pfor + vb, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_pfor_vb_interleaved(in, out); + } else { + dprintf("%d+%d: blocktype=%d (pfor + vb), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_pfor_vb(in, num_this_block, out); + } break; case BlockType::PFOR_BITMAP: - dprintf("%d+%d: blocktype=%d (pfor + bitmap), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); - in = decode_pfor_bitmap(in, num_this_block, out); + if (interleaved && num_this_block == BlockSize) { + dprintf("%d+%d: blocktype=%d (pfor + bitmap, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_pfor_bitmap_interleaved(in, out); + } else { + dprintf("%d+%d: blocktype=%d (pfor + bitmap), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); + in = decode_pfor_bitmap(in, num_this_block, out); + } break; case BlockType::CONSTANT: dprintf("%d+%d: blocktype=%d (constant), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f); -- 2.39.2