]> git.sesse.net Git - plocate/commitdiff
Support decoding the SIMD interleaved TurboPFor formats.
authorSteinar H. Gunderson <steinar+git@gunderson.no>
Mon, 5 Oct 2020 19:49:05 +0000 (21:49 +0200)
committerSteinar H. Gunderson <steinar+git@gunderson.no>
Mon, 5 Oct 2020 19:50:54 +0000 (21:50 +0200)
Our decoder is even slower for these than for the regular formats,
but at least it allows us to switch the format back. We'll see about
some mild optimization next.

bench.cpp
plocate-build.cpp
plocate.cpp
turbopfor.h

index d9704ef53a71ca203578047ec372605837e81feb..03b3967ac4ab4eddb8752e5028795ebc4f93191b 100644 (file)
--- a/bench.cpp
+++ b/bench.cpp
@@ -52,8 +52,8 @@ int main(void)
                out1.resize(num_docids + 128);
                out2.resize(num_docids + 128);
                unsigned char *pldata = reinterpret_cast<unsigned char *>(&pl[0]);
-               p4nd1dec32(pldata, num_docids, &out1[0]);
-               decode_pfor_delta1<128>(pldata, num_docids, &out2[0]);
+               p4nd1dec128v32(pldata, num_docids, &out1[0]);
+               decode_pfor_delta1<128>(pldata, num_docids, /*interleaved=*/true, &out2[0]);
                for (unsigned i = 0; i < num_docids; ++i) {
                        if (out1[i] != out2[i]) {
                                if (++num_errors < 10) {
@@ -72,7 +72,7 @@ int main(void)
        steady_clock::time_point start = steady_clock::now();
        for (auto &[pl, num_docids] : posting_lists) {
                unsigned char *pldata = reinterpret_cast<unsigned char *>(&pl[0]);
-               p4nd1dec32(pldata, num_docids, &dummy[0]);
+               p4nd1dec128v32(pldata, num_docids, &dummy[0]);
        }
        steady_clock::time_point end = steady_clock::now();
        double reference_sec = duration<double>(end - start).count();
@@ -81,9 +81,9 @@ int main(void)
        start = steady_clock::now();
        for (auto &[pl, num_docids] : posting_lists) {
                unsigned char *pldata = reinterpret_cast<unsigned char *>(&pl[0]);
-               decode_pfor_delta1<128>(pldata, num_docids, &dummy[0]);
+               decode_pfor_delta1<128>(pldata, num_docids, /*interleaved=*/true, &dummy[0]);
        }
        end = steady_clock::now();
        double own_sec = duration<double>(end - start).count();
-       fprintf(stderr, "Decoding with own implementation: %.1f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec);
+       fprintf(stderr, "Decoding with own implementation: %.3f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec);
 }
index d8594303ad5a4341ec551fbe9db67e3ba2815fd0..0a19c1fbab84da487a020bd372c189163ae63ff8 100644 (file)
@@ -130,14 +130,14 @@ void PostingListBuilder::append_block()
 {
        unsigned char buf[P4NENC_BOUND(128)];
        assert(pending_docids.size() == 128);
-       unsigned char *end = p4d1enc32(pending_docids.data(), 128, buf, last_block_end);
+       unsigned char *end = p4d1enc128v32(pending_docids.data(), 128, buf, last_block_end);
        encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
 }
 
 void PostingListBuilder::write_header(uint32_t docid)
 {
        unsigned char buf[P4NENC_BOUND(1)];
-       size_t bytes = p4nd1enc32(&docid, 1, buf);
+       size_t bytes = p4nd1enc128v32(&docid, 1, buf);
        encoded.append(reinterpret_cast<char *>(buf), bytes);
 }
 
index 9b1f234c1b8e3653ba314f9208e172d129f09692..2c5514ee8b16bc5f3c8461a0c97821e6a2216987 100644 (file)
@@ -391,7 +391,7 @@ void do_search_file(const vector<string> &needles, const char *filename)
                        unsigned char *pldata = reinterpret_cast<unsigned char *>(s.data());
                        if (in1.empty()) {
                                in1.resize(num + 128);
-                               decode_pfor_delta1<128>(pldata, num, &in1[0]);
+                               decode_pfor_delta1<128>(pldata, num, /*interleaved=*/true, &in1[0]);
                                in1.resize(num);
                                dprintf("trigram '%c%c%c' (%zu bytes) decoded to %zu entries\n", trgm & 0xff,
                                        (trgm >> 8) & 0xff, (trgm >> 16) & 0xff, len, num);
@@ -399,7 +399,7 @@ void do_search_file(const vector<string> &needles, const char *filename)
                                if (in2.size() < num + 128) {
                                        in2.resize(num + 128);
                                }
-                               decode_pfor_delta1<128>(pldata, num, &in2[0]);
+                               decode_pfor_delta1<128>(pldata, num, /*interleaved=*/true, &in2[0]);
 
                                out.clear();
                                set_intersection(in1.begin(), in1.end(), in2.begin(), in2.begin() + num,
index a7913db3f8256fbeb5b8e07f48550c2910571dec..12150f9485aee882b561290beb5642594b0be1c0 100644 (file)
@@ -107,6 +107,35 @@ private:
        unsigned bits_used = 0;
 };
 
+template<unsigned NumStreams>
+struct InterleavedBitReader {
+public:
+       InterleavedBitReader(const unsigned char *in, unsigned bits)
+               : in(in), bits(bits), mask((1U << bits) - 1) {}
+       uint32_t read()
+       {
+               uint32_t val;
+               if (bits_used + bits > 32) {
+                       val = (read_le<uint32_t>(in) >> bits_used) | (read_le<uint32_t>(in + Stride) << (32 - bits_used));
+               } else {
+                       val = (read_le<uint32_t>(in) >> bits_used);
+               }
+
+               bits_used += bits;
+               in += Stride * (bits_used / 32);
+               bits_used %= 32;
+
+               return val & mask;
+       }
+
+private:
+       static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
+       const unsigned char *in;
+       const unsigned bits;
+       const unsigned mask;
+       unsigned bits_used = 0;
+};
+
 // Does not properly account for overflow.
 inline unsigned div_round_up(unsigned val, unsigned div)
 {
@@ -155,6 +184,30 @@ const unsigned char *decode_for(const unsigned char *in, unsigned num, Docid *ou
        return in + bytes_for_packed_bits(num, bit_width);
 }
 
+// Like decode_for(), but the values are organized in four independent streams,
+// for SIMD (presumably SSE2). Supports a whole block only.
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_for_interleaved(const unsigned char *in, Docid *out)
+{
+       const unsigned bit_width = *in++ & 0x3f;
+
+       InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
+       for (unsigned i = 0; i < BlockSize / 4; ++i) {
+               out[i * 4 + 0] = bs0.read();
+               out[i * 4 + 1] = bs1.read();
+               out[i * 4 + 2] = bs2.read();
+               out[i * 4 + 3] = bs3.read();
+       }
+       Docid *prev_out = out - 1;
+       for (unsigned i = 0; i < BlockSize; ++i) {
+               out[i] += prev_out[i] + 1;
+       }
+       return in + bytes_for_packed_bits(BlockSize, bit_width);
+}
+
 // PFor block with bitmap exceptions. Layout:
 //
 //  - Bit width (6 bits) | type << 6
@@ -201,6 +254,52 @@ const unsigned char *decode_pfor_bitmap(const unsigned char *in, unsigned num, D
        return in + bytes_for_packed_bits(num, bit_width);
 }
 
+// Like decode_pfor_bitmap(), but the base values are organized in four
+// independent streams, for SIMD (presumably SSE2). Supports a whole block only.
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_pfor_bitmap_interleaved(const unsigned char *in, Docid *out)
+{
+       memset(out, 0, BlockSize * sizeof(Docid));
+
+       const unsigned bit_width = *in++ & 0x3f;
+       const unsigned exception_bit_width = *in++;
+
+       // Decode exceptions.
+       {
+               const uint64_t *exception_bitmap_ptr = reinterpret_cast<const uint64_t *>(in);
+               in += div_round_up(BlockSize, 8);
+
+               int num_exceptions = 0;
+
+               BitReader bs(in, exception_bit_width);
+               for (unsigned i = 0; i < BlockSize; i += 64, ++exception_bitmap_ptr) {
+                       uint64_t exceptions = read_le<uint64_t>(exception_bitmap_ptr);
+                       for (; exceptions != 0; exceptions &= exceptions - 1, ++num_exceptions) {
+                               unsigned idx = (ffsll(exceptions) - 1) + i;
+                               out[idx] = bs.read() << bit_width;
+                       }
+               }
+               in += bytes_for_packed_bits(num_exceptions, exception_bit_width);
+       }
+
+       // Decode the base values, and delta-decode.
+       InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
+       for (unsigned i = 0; i < BlockSize / 4; ++i) {
+               out[i * 4 + 0] |= bs0.read();
+               out[i * 4 + 1] |= bs1.read();
+               out[i * 4 + 2] |= bs2.read();
+               out[i * 4 + 3] |= bs3.read();
+       }
+       Docid *prev_out = out - 1;
+       for (unsigned i = 0; i < BlockSize; ++i) {
+               out[i] += prev_out[i] + 1;
+       }
+       return in + bytes_for_packed_bits(BlockSize, bit_width);
+}
+
 // PFor block with variable-byte exceptions. Layout:
 //
 //  - Bit width (6 bits) | type << 6
@@ -253,6 +352,56 @@ const unsigned char *decode_pfor_vb(const unsigned char *in, unsigned num, Docid
        return in;
 }
 
+// Like decode_pfor_vb(), but the base values are organized in four
+// independent streams, for SIMD (presumably SSE2). Supports a whole block only.
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_pfor_vb_interleaved(const unsigned char *in, Docid *out)
+{
+       const unsigned bit_width = *in++ & 0x3f;
+       unsigned num_exceptions = *in++;
+
+       // Decode the base values.
+       InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
+       for (unsigned i = 0; i < BlockSize / 4; ++i) {
+               out[i * 4 + 0] = bs0.read();
+               out[i * 4 + 1] = bs1.read();
+               out[i * 4 + 2] = bs2.read();
+               out[i * 4 + 3] = bs3.read();
+       }
+       in += bytes_for_packed_bits(BlockSize, bit_width);
+
+       // Decode exceptions.
+       Docid exceptions[BlockSize];
+       if (*in == 255) {
+               ++in;
+               for (unsigned i = 0; i < num_exceptions; ++i) {
+                       exceptions[i] = read_le<Docid>(in);
+                       in += sizeof(Docid);
+               }
+       } else {
+               for (unsigned i = 0; i < num_exceptions; ++i) {
+                       in = read_vb(in, &exceptions[i]);
+               }
+       }
+
+       // Apply exceptions.
+       for (unsigned i = 0; i < num_exceptions; ++i) {
+               unsigned idx = *in++;
+               out[idx] |= exceptions[i] << bit_width;
+       }
+
+       // Delta-decode.
+       Docid *prev_out = out - 1;
+       for (unsigned i = 0; i < BlockSize; ++i) {
+               out[i] = out[i] + prev_out[i] + 1;
+       }
+
+       return in;
+}
+
 enum BlockType {
        FOR = 0,
        PFOR_VB = 1,
@@ -261,7 +410,7 @@ enum BlockType {
 };
 
 template<unsigned BlockSize, class Docid>
-const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, Docid *out)
+const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, bool interleaved, Docid *out)
 {
        if (num == 0) {
                return in;
@@ -272,16 +421,31 @@ const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, D
                const unsigned num_this_block = std::min<unsigned>(num - i, BlockSize);
                switch (in[0] >> 6) {
                case BlockType::FOR:
-                       dprintf("%d+%d: blocktype=%d (for), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                       in = decode_for(in, num_this_block, out);
+                       if (interleaved && num_this_block == BlockSize) {
+                               dprintf("%d+%d: blocktype=%d (for, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_for_interleaved<BlockSize>(in, out);
+                       } else {
+                               dprintf("%d+%d: blocktype=%d (for), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_for(in, num_this_block, out);
+                       }
                        break;
                case BlockType::PFOR_VB:
-                       dprintf("%d+%d: blocktype=%d (pfor + vb), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                       in = decode_pfor_vb<BlockSize>(in, num_this_block, out);
+                       if (interleaved && num_this_block == BlockSize) {
+                               dprintf("%d+%d: blocktype=%d (pfor + vb, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_pfor_vb_interleaved<BlockSize>(in, out);
+                       } else {
+                               dprintf("%d+%d: blocktype=%d (pfor + vb), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_pfor_vb<BlockSize>(in, num_this_block, out);
+                       }
                        break;
                case BlockType::PFOR_BITMAP:
-                       dprintf("%d+%d: blocktype=%d (pfor + bitmap), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                       in = decode_pfor_bitmap(in, num_this_block, out);
+                       if (interleaved && num_this_block == BlockSize) {
+                               dprintf("%d+%d: blocktype=%d (pfor + bitmap, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_pfor_bitmap_interleaved<BlockSize>(in, out);
+                       } else {
+                               dprintf("%d+%d: blocktype=%d (pfor + bitmap), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_pfor_bitmap(in, num_this_block, out);
+                       }
                        break;
                case BlockType::CONSTANT:
                        dprintf("%d+%d: blocktype=%d (constant), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);