]> git.sesse.net Git - plocate/commitdiff
Move TurboPFor compilation to its own compilation unit.
authorSteinar H. Gunderson <steinar+git@gunderson.no>
Thu, 8 Oct 2020 21:57:23 +0000 (23:57 +0200)
committerSteinar H. Gunderson <steinar+git@gunderson.no>
Thu, 8 Oct 2020 21:57:23 +0000 (23:57 +0200)
This file takes so long to compile, especially with optimization
and/or ASan on, that it became a real annoyance whenever we were
modifying plocate.cpp for anything else. Takes away some genericness
we don't really use.

We could do the same thing with the encoder if need be.

meson.build
plocate.cpp
turbopfor.cpp [new file with mode: 0644]
turbopfor.h

index e8c37f04f7d3a13a930bcc0e552d08667f4e47cd..eca029769cc3a5de48cce87d5d3b54307234454d 100644 (file)
@@ -8,7 +8,7 @@ if not uringdep.found()
        add_project_arguments('-DWITHOUT_URING', language: 'cpp')
 endif
 
-executable('plocate', ['plocate.cpp', 'io_uring_engine.cpp'],
+executable('plocate', ['plocate.cpp', 'io_uring_engine.cpp', 'turbopfor.cpp'],
        dependencies: [uringdep, zstddep],
        install: true,
        install_mode: ['rwxr-sr-x', 'root', 'mlocate'])
index 508525c475befb644738ef519c83cfc31ac84c0c..9a5c4cdbe8ba6e05bb096375f3625b7454a4d735 100644 (file)
@@ -406,7 +406,7 @@ void do_search_file(const vector<string> &needles, const char *filename)
                        const unsigned char *pldata = reinterpret_cast<const unsigned char *>(s.data());
                        if (in1.empty()) {
                                in1.resize(num + 128);
-                               decode_pfor_delta1<128>(pldata, num, /*interleaved=*/true, &in1[0]);
+                               decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &in1[0]);
                                in1.resize(num);
                                dprintf("trigram '%c%c%c' (%zu bytes) decoded to %zu entries\n", trgm & 0xff,
                                        (trgm >> 8) & 0xff, (trgm >> 16) & 0xff, len, num);
@@ -414,7 +414,7 @@ void do_search_file(const vector<string> &needles, const char *filename)
                                if (in2.size() < num + 128) {
                                        in2.resize(num + 128);
                                }
-                               decode_pfor_delta1<128>(pldata, num, /*interleaved=*/true, &in2[0]);
+                               decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &in2[0]);
 
                                out.clear();
                                set_intersection(in1.begin(), in1.end(), in2.begin(), in2.begin() + num,
diff --git a/turbopfor.cpp b/turbopfor.cpp
new file mode 100644 (file)
index 0000000..2dff20e
--- /dev/null
@@ -0,0 +1,785 @@
+#include <algorithm>
+#include <assert.h>
+#include <endian.h>
+#include <limits.h>
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+#define COULD_HAVE_SSE2
+#include <immintrin.h>
+#endif
+
+#include "turbopfor-common.h"
+
+#define dprintf(...)
+//#define dprintf(...) fprintf(stderr, __VA_ARGS__);
+
+// Forward declarations to declare to the template code below that they exist.
+// (These must seemingly be non-templates for function multiversioning to work.)
+__attribute__((target("default")))
+const unsigned char *
+decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out);
+__attribute__((target("default")))
+const unsigned char *
+decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out);
+__attribute__((target("default")))
+const unsigned char *
+decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
+
+#ifdef COULD_HAVE_SSE2
+__attribute__((target("sse2")))
+const unsigned char *
+decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out);
+__attribute__((target("sse2")))
+const unsigned char *
+decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out);
+__attribute__((target("sse2")))
+const unsigned char *
+decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
+#endif
+
+template<class Docid>
+Docid read_le(const void *in)
+{
+       Docid val;
+       memcpy(&val, in, sizeof(val));
+       if constexpr (sizeof(Docid) == 8) {
+               return le64toh(val);
+       } else if constexpr (sizeof(Docid) == 4) {
+               return le32toh(val);
+       } else if constexpr (sizeof(Docid) == 2) {
+               return le16toh(val);
+       } else if constexpr (sizeof(Docid) == 1) {
+               return val;
+       } else {
+               assert(false);
+       }
+}
+
+// Reads a single value with an encoding that looks a bit like PrefixVarint.
+// It's unclear why this doesn't use the varbyte encoding.
+template<class Docid>
+const unsigned char *read_baseval(const unsigned char *in, Docid *out)
+{
+       //fprintf(stderr, "baseval: 0x%02x 0x%02x 0x%02x 0x%02x\n", in[0], in[1], in[2], in[3]);
+       if (*in < 128) {
+               *out = *in;
+               return in + 1;
+       } else if (*in < 192) {
+               *out = ((uint32_t(in[0]) << 8) | uint32_t(in[1])) & 0x3fff;
+               return in + 2;
+       } else if (*in < 224) {
+               *out = ((uint32_t(in[0]) << 16) |
+                       (uint32_t(in[2]) << 8) |
+                       (uint32_t(in[1]))) & 0x1fffff;
+               return in + 3;
+       } else {
+               assert(false);  // Not implemented.
+       }
+}
+
+// Does not read past the end of the input.
+template<class Docid>
+const unsigned char *read_vb(const unsigned char *in, Docid *out)
+{
+       if (*in <= 176) {
+               *out = *in;
+               return in + 1;
+       } else if (*in <= 240) {
+               *out = ((uint32_t(in[0] - 177) << 8) | uint32_t(in[1])) + 177;
+               return in + 2;
+       } else if (*in <= 248) {
+               *out = ((uint32_t(in[0] - 241) << 16) | read_le<uint16_t>(in + 1)) + 16561;
+               return in + 3;
+       } else if (*in == 249) {
+               *out = (uint32_t(in[1])) |
+                       (uint32_t(in[2]) << 8) |
+                       (uint32_t(in[3]) << 16);
+               return in + 4;
+       } else if (*in == 250) {
+               *out = read_le<uint32_t>(in + 1);
+               return in + 5;
+       } else {
+               assert(false);
+       }
+}
+
+struct BitReader {
+public:
+       BitReader(const unsigned char *in, unsigned bits)
+               : in(in), bits(bits), mask(mask_for_bits(bits)) {}
+
+       // Can read 4 bytes past the end of the input (if bits_used == 0).
+       uint32_t read()
+       {
+               uint32_t val = (read_le<uint32_t>(in) >> bits_used) & mask;
+
+               bits_used += bits;
+               in += bits_used / 8;
+               bits_used %= 8;
+
+               return val;
+       }
+
+private:
+       const unsigned char *in;
+       const unsigned bits;
+       const unsigned mask;
+       unsigned bits_used = 0;
+};
+
+template<unsigned NumStreams>
+struct InterleavedBitReader {
+public:
+       InterleavedBitReader(const unsigned char *in, unsigned bits)
+               : in(in), bits(bits), mask(mask_for_bits(bits)) {}
+
+       // Can read 4 bytes past the end of the input (if bit_width == 0).
+       uint32_t read()
+       {
+               uint32_t val;
+               if (bits_used + bits > 32) {
+                       val = (read_le<uint32_t>(in) >> bits_used) | (read_le<uint32_t>(in + Stride) << (32 - bits_used));
+               } else {
+                       val = (read_le<uint32_t>(in) >> bits_used);
+               }
+
+               bits_used += bits;
+               in += Stride * (bits_used / 32);
+               bits_used %= 32;
+
+               return val & mask;
+       }
+
+private:
+       static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
+       const unsigned char *in;
+       const unsigned bits;
+       const unsigned mask;
+       unsigned bits_used = 0;
+};
+
+#ifdef COULD_HAVE_SSE2
+struct InterleavedBitReaderSSE2 {
+public:
+       __attribute__((target("sse2")))
+       InterleavedBitReaderSSE2(const unsigned char *in, unsigned bits)
+               : in(reinterpret_cast<const __m128i *>(in)), bits(bits), mask(_mm_set1_epi32(mask_for_bits(bits))) {}
+
+       // Can read 16 bytes past the end of the input (if bit_width == 0).
+       __attribute__((target("sse2")))
+       __m128i
+       read()
+       {
+               __m128i val = _mm_srli_epi32(_mm_loadu_si128(in), bits_used);
+               if (bits_used + bits > 32) {
+                       __m128i val_upper = _mm_slli_epi32(_mm_loadu_si128(in + 1), 32 - bits_used);
+                       val = _mm_or_si128(val, val_upper);
+               }
+               val = _mm_and_si128(val, mask);
+
+               bits_used += bits;
+               in += bits_used / 32;
+               bits_used %= 32;
+               return val;
+       }
+
+private:
+       const __m128i *in;
+       const unsigned bits;
+       const __m128i mask;
+       unsigned bits_used = 0;
+};
+#endif
+
+// Constant block. Layout:
+//
+//  - Bit width (6 bits) | type << 6
+//  - Base values (<bits> bits, rounded up to nearest byte)
+//
+// Can read 4 bytes past the end of the input (if bit_width == 0).
+template<class Docid>
+const unsigned char *decode_constant(const unsigned char *in, unsigned num, Docid *out)
+{
+       const unsigned bit_width = *in++ & 0x3f;
+       Docid val = read_le<Docid>(in);
+       if (bit_width < sizeof(Docid) * 8) {
+               val &= mask_for_bits(bit_width);
+       }
+
+       Docid prev_val = out[-1];
+       for (unsigned i = 0; i < num; ++i) {
+               out[i] = prev_val = val + prev_val + 1;
+       }
+       return in + div_round_up(bit_width, 8);
+}
+
+// FOR block (ie., PFor without exceptions). Layout:
+//
+//  - Bit width (6 bits) | type << 6
+//  - Base values (<num> values of <bits> bits, rounded up to a multiple of 32 values)
+//
+// Can read 4 bytes past the end of the input (inherit from BitReader).
+template<class Docid>
+const unsigned char *decode_for(const unsigned char *in, unsigned num, Docid *out)
+{
+       const unsigned bit_width = *in++ & 0x3f;
+
+       Docid prev_val = out[-1];
+       BitReader bs(in, bit_width);
+       for (unsigned i = 0; i < num; ++i) {
+               prev_val = out[i] = bs.read() + prev_val + 1;
+       }
+       return in + bytes_for_packed_bits(num, bit_width);
+}
+
+#ifdef COULD_HAVE_SSE2
+class DeltaDecoderSSE2 {
+public:
+       __attribute__((target("sse2")))
+       DeltaDecoderSSE2(uint32_t prev_val)
+               : prev_val(_mm_set1_epi32(prev_val)) {}
+
+       __attribute__((target("sse2")))
+       __m128i
+       decode(__m128i val)
+       {
+               val = _mm_add_epi32(val, _mm_slli_si128(val, 4));
+               val = _mm_add_epi32(val, _mm_slli_si128(val, 8));
+               val = _mm_add_epi32(val, _mm_add_epi32(prev_val, delta));
+               prev_val = _mm_shuffle_epi32(val, _MM_SHUFFLE(3, 3, 3, 3));
+               return val;
+       }
+
+private:
+       // Use 4/3/2/1 as delta instead of fixed 1, so that we can do the prev_val + delta
+       // in parallel with something else.
+       const __m128i delta = _mm_set_epi32(4, 3, 2, 1);
+
+       __m128i prev_val;
+};
+
+template<unsigned BlockSize>
+__attribute__((target("sse2"))) inline void delta_decode_sse2(uint32_t *out)
+{
+       DeltaDecoderSSE2 delta(out[-1]);
+       __m128i *outvec = reinterpret_cast<__m128i *>(out);
+       for (unsigned i = 0; i < BlockSize / 4; ++i) {
+               __m128i val = _mm_loadu_si128(outvec + i);
+               _mm_storeu_si128(outvec + i, delta.decode(val));
+       }
+}
+
+// Can read 16 bytes past the end of its input (inherit from InterleavedBitReaderSSE2).
+template<unsigned BlockSize, bool OrWithExisting, bool DeltaDecode, unsigned bit_width>
+__attribute__((target("sse2")))
+const unsigned char *
+decode_bitmap_sse2_unrolled(const unsigned char *in, uint32_t *out)
+{
+       __m128i *outvec = reinterpret_cast<__m128i *>(out);
+       DeltaDecoderSSE2 delta(out[-1]);
+       InterleavedBitReaderSSE2 bs(in, bit_width);
+#pragma GCC unroll 32
+       for (unsigned i = 0; i < BlockSize / 4; ++i) {
+               __m128i val = bs.read();
+               if constexpr (OrWithExisting) {
+                       val = _mm_or_si128(val, _mm_slli_epi32(_mm_loadu_si128(outvec + i), bit_width));
+               }
+               if constexpr (DeltaDecode) {
+                       val = delta.decode(val);
+               }
+               _mm_storeu_si128(outvec + i, val);
+       }
+       in += bytes_for_packed_bits(BlockSize, bit_width);
+       return in;
+}
+
+// Can read 16 bytes past the end of its input (inherit from InterleavedBitReaderSSE2).
+template<unsigned BlockSize, bool OrWithExisting, bool DeltaDecode>
+__attribute__((target("sse2")))
+const unsigned char *
+decode_bitmap_sse2(const unsigned char *in, unsigned bit_width, uint32_t *out)
+{
+       switch (bit_width) {
+       case 0:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 0>(in, out);
+       case 1:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 1>(in, out);
+       case 2:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 2>(in, out);
+       case 3:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 3>(in, out);
+       case 4:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 4>(in, out);
+       case 5:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 5>(in, out);
+       case 6:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 6>(in, out);
+       case 7:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 7>(in, out);
+       case 8:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 8>(in, out);
+       case 9:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 9>(in, out);
+       case 10:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 10>(in, out);
+       case 11:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 11>(in, out);
+       case 12:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 12>(in, out);
+       case 13:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 13>(in, out);
+       case 14:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 14>(in, out);
+       case 15:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 15>(in, out);
+       case 16:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 16>(in, out);
+       case 17:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 17>(in, out);
+       case 18:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 18>(in, out);
+       case 19:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 19>(in, out);
+       case 20:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 20>(in, out);
+       case 21:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 21>(in, out);
+       case 22:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 22>(in, out);
+       case 23:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 23>(in, out);
+       case 24:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 24>(in, out);
+       case 25:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 25>(in, out);
+       case 26:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 26>(in, out);
+       case 27:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 27>(in, out);
+       case 28:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 28>(in, out);
+       case 29:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 29>(in, out);
+       case 30:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 30>(in, out);
+       case 31:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 31>(in, out);
+       case 32:
+               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 32>(in, out);
+       }
+       assert(false);
+}
+#endif
+
+// Like decode_for(), but the values are organized in four independent streams,
+// for SIMD (presumably SSE2). Supports a whole block only.
+//
+// Can read 16 bytes past the end of its input (inherit from InterleavedBitReader).
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_for_interleaved_generic(const unsigned char *in, Docid *out)
+{
+       const unsigned bit_width = *in++ & 0x3f;
+
+       InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
+       for (unsigned i = 0; i < BlockSize / 4; ++i) {
+               out[i * 4 + 0] = bs0.read();
+               out[i * 4 + 1] = bs1.read();
+               out[i * 4 + 2] = bs2.read();
+               out[i * 4 + 3] = bs3.read();
+       }
+       Docid prev_val = out[-1];
+       for (unsigned i = 0; i < BlockSize; ++i) {
+               out[i] = prev_val = out[i] + prev_val + 1;
+       }
+       return in + bytes_for_packed_bits(BlockSize, bit_width);
+}
+
+// Does not read past the end of the input.
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_for_interleaved(const unsigned char *in, Docid *out)
+{
+       if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
+               return decode_for_interleaved_128_32(in, out);
+       } else {
+               return decode_for_interleaved_generic(in, out);
+       }
+}
+
+// Does not read past the end of the input.
+__attribute__((target("default")))
+const unsigned char *
+decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out)
+{
+       return decode_for_interleaved_generic<128>(in, out);
+}
+
+#ifdef COULD_HAVE_SSE2
+// Specialized version for SSE2.
+// Can read 16 bytes past the end of the input (inherit from decode_bitmap_sse2()).
+__attribute__((target("sse2")))
+const unsigned char *
+decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out)
+{
+       constexpr unsigned BlockSize = 128;
+
+       const unsigned bit_width = *in++ & 0x3f;
+
+       in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/false, /*DeltaDecode=*/true>(in, bit_width, out);
+
+       return in;
+}
+#endif
+
+// Can read 4 bytes past the end of the input (inherit from BitReader).
+template<class Docid>
+const unsigned char *decode_pfor_bitmap_exceptions(const unsigned char *in, unsigned num, Docid *out)
+{
+       const unsigned exception_bit_width = *in++;
+       const uint64_t *exception_bitmap_ptr = reinterpret_cast<const uint64_t *>(in);
+       in += div_round_up(num, 8);
+
+       int num_exceptions = 0;
+
+       BitReader bs(in, exception_bit_width);
+       for (unsigned i = 0; i < num; i += 64, ++exception_bitmap_ptr) {
+               uint64_t exceptions = read_le<uint64_t>(exception_bitmap_ptr);
+               if (num - i < 64) {
+                       // We've read some bytes past the end, so clear out the junk bits.
+                       exceptions &= (1ULL << (num - i)) - 1;
+               }
+               for (; exceptions != 0; exceptions &= exceptions - 1, ++num_exceptions) {
+                       unsigned idx = (ffsll(exceptions) - 1) + i;
+                       out[idx] = bs.read();
+               }
+       }
+       in += bytes_for_packed_bits(num_exceptions, exception_bit_width);
+       return in;
+}
+
+// PFor block with bitmap exceptions. Layout:
+//
+//  - Bit width (6 bits) | type << 6
+//  - Exception bit width (8 bits)
+//  - Bitmap of which values have exceptions (<num> bits, rounded up to a byte)
+//  - Exceptions (<num_exc> values of <bits_exc> bits, rounded up to a byte)
+//  - Base values (<num> values of <bits> bits, rounded up to a byte)
+//
+// Can read 4 bytes past the end of the input (inherit from BitReader).
+template<class Docid>
+const unsigned char *decode_pfor_bitmap(const unsigned char *in, unsigned num, Docid *out)
+{
+       memset(out, 0, num * sizeof(Docid));
+
+       const unsigned bit_width = *in++ & 0x3f;
+
+       in = decode_pfor_bitmap_exceptions(in, num, out);
+
+       // Decode the base values, and delta-decode.
+       Docid prev_val = out[-1];
+       BitReader bs(in, bit_width);
+       for (unsigned i = 0; i < num; ++i) {
+               out[i] = prev_val = ((out[i] << bit_width) | bs.read()) + prev_val + 1;
+       }
+       return in + bytes_for_packed_bits(num, bit_width);
+}
+
+// Like decode_pfor_bitmap(), but the base values are organized in four
+// independent streams, for SIMD (presumably SSE2). Supports a whole block only.
+//
+// Can read 16 bytes past the end of the input (inherit from InterleavedBitReader
+// and decode_pfor_bitmap_exceptions()).
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_pfor_bitmap_interleaved_generic(const unsigned char *in, Docid *out)
+{
+       memset(out, 0, BlockSize * sizeof(Docid));
+
+       const unsigned bit_width = *in++ & 0x3f;
+
+       in = decode_pfor_bitmap_exceptions(in, BlockSize, out);
+
+       // Decode the base values.
+       InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
+       for (unsigned i = 0; i < BlockSize / 4; ++i) {
+               out[i * 4 + 0] = bs0.read() | (out[i * 4 + 0] << bit_width);
+               out[i * 4 + 1] = bs1.read() | (out[i * 4 + 1] << bit_width);
+               out[i * 4 + 2] = bs2.read() | (out[i * 4 + 2] << bit_width);
+               out[i * 4 + 3] = bs3.read() | (out[i * 4 + 3] << bit_width);
+       }
+
+       // Delta-decode.
+       Docid prev_val = out[-1];
+       for (unsigned i = 0; i < BlockSize; ++i) {
+               out[i] = prev_val = out[i] + prev_val + 1;
+       }
+       return in + bytes_for_packed_bits(BlockSize, bit_width);
+}
+
+// Can read 16 bytes past the end of the input (inherit from decode_pfor_bitmap_interleaved_generic()).
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_pfor_bitmap_interleaved(const unsigned char *in, Docid *out)
+{
+       if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
+               return decode_pfor_bitmap_interleaved_128_32(in, out);
+       } else {
+               return decode_pfor_bitmap_interleaved_generic(in, out);
+       }
+}
+
+__attribute__((target("default")))
+const unsigned char *
+decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out)
+{
+       return decode_pfor_bitmap_interleaved_generic<128>(in, out);
+}
+
+#ifdef COULD_HAVE_SSE2
+// Specialized version for SSE2.
+//
+// Can read 16 bytes past the end of the input (inherit from InterleavedBitReaderSSE2
+// and decode_pfor_bitmap_exceptions()).
+__attribute__((target("sse2")))
+const unsigned char *
+decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out)
+{
+       constexpr unsigned BlockSize = 128;
+
+// Set all output values to zero, before the exceptions are filled in.
+#pragma GCC unroll 4
+       for (unsigned i = 0; i < BlockSize / 4; ++i) {
+               _mm_storeu_si128(reinterpret_cast<__m128i *>(out) + i, _mm_setzero_si128());
+       }
+
+       const unsigned bit_width = *in++ & 0x3f;
+
+       in = decode_pfor_bitmap_exceptions(in, BlockSize, out);
+       in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/true, /*DeltaDecode=*/true>(in, bit_width, out);
+
+       return in;
+}
+#endif
+
+// PFor block with variable-byte exceptions. Layout:
+//
+//  - Bit width (6 bits) | type << 6
+//  - Number of exceptions (8 bits)
+//  - Base values (<num> values of <bits> bits, rounded up to a byte)
+//  - Exceptions:
+//    - If first byte is 255, <num_exc> 32-bit values (does not include the 255 byte)
+//    - Else, <num_exc> varbyte-encoded values (includes the non-255 byte)
+//  - Indexes of exceptions (<num_exc> bytes).
+//
+// Can read 4 bytes past the end of the input (inherit from BitReader,
+// assuming zero exceptions).
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_pfor_vb(const unsigned char *in, unsigned num, Docid *out)
+{
+       //fprintf(stderr, "in=%p out=%p num=%u\n", in, out, num);
+
+       const unsigned bit_width = *in++ & 0x3f;
+       unsigned num_exceptions = *in++;
+
+       // Decode the base values.
+       BitReader bs(in, bit_width);
+       for (unsigned i = 0; i < num; ++i) {
+               out[i] = bs.read();
+       }
+       in += bytes_for_packed_bits(num, bit_width);
+
+       // Decode exceptions.
+       Docid exceptions[BlockSize];
+       if (*in == 255) {
+               ++in;
+               for (unsigned i = 0; i < num_exceptions; ++i) {
+                       exceptions[i] = read_le<Docid>(in);
+                       in += sizeof(Docid);
+               }
+       } else {
+               for (unsigned i = 0; i < num_exceptions; ++i) {
+                       in = read_vb(in, &exceptions[i]);
+               }
+       }
+       // Apply exceptions.
+       for (unsigned i = 0; i < num_exceptions; ++i) {
+               unsigned idx = *in++;
+               out[idx] |= exceptions[i] << bit_width;
+       }
+
+       // Delta-decode.
+       Docid prev_val = out[-1];
+       for (unsigned i = 0; i < num; ++i) {
+               out[i] = prev_val = out[i] + prev_val + 1;
+       }
+
+       return in;
+}
+
+// Like decode_pfor_vb(), but the base values are organized in four
+// independent streams, for SIMD (presumably SSE2). Supports a whole block only.
+// Can read 16 bytes past the end of its input (inherit from InterleavedBitReader).
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_pfor_vb_interleaved_generic(const unsigned char *in, Docid *out)
+{
+       const unsigned bit_width = *in++ & 0x3f;
+       unsigned num_exceptions = *in++;
+
+       // Decode the base values.
+       InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
+       InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
+       for (unsigned i = 0; i < BlockSize / 4; ++i) {
+               out[i * 4 + 0] = bs0.read();
+               out[i * 4 + 1] = bs1.read();
+               out[i * 4 + 2] = bs2.read();
+               out[i * 4 + 3] = bs3.read();
+       }
+       in += bytes_for_packed_bits(BlockSize, bit_width);
+
+       // Decode exceptions.
+       Docid exceptions[BlockSize];
+       if (*in == 255) {
+               ++in;
+               for (unsigned i = 0; i < num_exceptions; ++i) {
+                       exceptions[i] = read_le<Docid>(in);
+                       in += sizeof(Docid);
+               }
+       } else {
+               for (unsigned i = 0; i < num_exceptions; ++i) {
+                       in = read_vb(in, &exceptions[i]);
+               }
+       }
+
+       // Apply exceptions.
+       for (unsigned i = 0; i < num_exceptions; ++i) {
+               unsigned idx = *in++;
+               out[idx] |= exceptions[i] << bit_width;
+       }
+
+       // Delta-decode.
+       Docid prev_val = out[-1];
+       for (unsigned i = 0; i < BlockSize; ++i) {
+               out[i] = prev_val = out[i] + prev_val + 1;
+       }
+
+       return in;
+}
+
+// Can read 16 bytes past the end of its input (inherit from decode_pfor_vb_interleaved_generic()).
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_pfor_vb_interleaved(const unsigned char *in, Docid *out)
+{
+       if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
+               return decode_pfor_vb_interleaved_128_32(in, out);
+       } else {
+               return decode_pfor_vb_interleaved_generic(in, out);
+       }
+}
+
+__attribute__((target("default")))
+const unsigned char *
+decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
+{
+       return decode_pfor_vb_interleaved_generic<128>(in, out);
+}
+
+// Specialized version for SSE2.
+// Can read 16 bytes past the end of the input (inherit from decode_bitmap_sse2()).
+__attribute__((target("sse2")))
+const unsigned char *
+decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
+{
+       constexpr unsigned BlockSize = 128;
+       using Docid = uint32_t;
+
+       const unsigned bit_width = *in++ & 0x3f;
+       unsigned num_exceptions = *in++;
+
+       // Decode the base values.
+       in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/false, /*DeltaDecode=*/false>(in, bit_width, out);
+
+       // Decode exceptions.
+       Docid exceptions[BlockSize];
+       if (*in == 255) {
+               ++in;
+               for (unsigned i = 0; i < num_exceptions; ++i) {
+                       exceptions[i] = read_le<Docid>(in);
+                       in += sizeof(Docid);
+               }
+       } else {
+               for (unsigned i = 0; i < num_exceptions; ++i) {
+                       in = read_vb(in, &exceptions[i]);
+               }
+       }
+
+       // Apply exceptions.
+       for (unsigned i = 0; i < num_exceptions; ++i) {
+               unsigned idx = *in++;
+               out[idx] |= exceptions[i] << bit_width;
+       }
+
+       delta_decode_sse2<BlockSize>(out);
+
+       return in;
+}
+
+// Can read 16 bytes past the end of the input (inherit from several functions).
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, bool interleaved, Docid *out)
+{
+       if (num == 0) {
+               return in;
+       }
+       in = read_baseval(in, out++);
+
+       for (unsigned i = 1; i < num; i += BlockSize, out += BlockSize) {
+               const unsigned num_this_block = std::min<unsigned>(num - i, BlockSize);
+               switch (in[0] >> 6) {
+               case BlockType::FOR:
+                       if (interleaved && num_this_block == BlockSize) {
+                               dprintf("%d+%d: blocktype=%d (for, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_for_interleaved<BlockSize>(in, out);
+                       } else {
+                               dprintf("%d+%d: blocktype=%d (for), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_for(in, num_this_block, out);
+                       }
+                       break;
+               case BlockType::PFOR_VB:
+                       if (interleaved && num_this_block == BlockSize) {
+                               dprintf("%d+%d: blocktype=%d (pfor + vb, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_pfor_vb_interleaved<BlockSize>(in, out);
+                       } else {
+                               dprintf("%d+%d: blocktype=%d (pfor + vb), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_pfor_vb<BlockSize>(in, num_this_block, out);
+                       }
+                       break;
+               case BlockType::PFOR_BITMAP:
+                       if (interleaved && num_this_block == BlockSize) {
+                               dprintf("%d+%d: blocktype=%d (pfor + bitmap, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_pfor_bitmap_interleaved<BlockSize>(in, out);
+                       } else {
+                               dprintf("%d+%d: blocktype=%d (pfor + bitmap), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                               in = decode_pfor_bitmap(in, num_this_block, out);
+                       }
+                       break;
+               case BlockType::CONSTANT:
+                       dprintf("%d+%d: blocktype=%d (constant), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
+                       in = decode_constant(in, num_this_block, out);
+                       break;
+               }
+       }
+
+       return in;
+}
+
+const unsigned char *decode_pfor_delta1_128(const unsigned char *in, unsigned num, bool interleaved, uint32_t *out)
+{
+       return decode_pfor_delta1<128>(in, num, interleaved, out);
+}
index 0146d568ee507daea9845958c07728c2dfd5affb..5fef0a0ceb33c679ce162b4f3f2f8b3277ec8b79 100644 (file)
 // in the input buffers; this is documented for each function (unlike
 // the reference implementation), but the documented slop assumes a
 // non-malicious encoder.
-
-#include <algorithm>
-#include <assert.h>
-#include <endian.h>
-#include <limits.h>
-#include <stdint.h>
-#include <string.h>
-
-#if defined(__i386__) || defined(__x86_64__)
-#define COULD_HAVE_SSE2
-#include <immintrin.h>
-#endif
-
-#include "turbopfor-common.h"
-
-// Forward declarations to declare to the template code below that they exist.
-// (These must seemingly be non-templates for function multiversioning to work.)
-__attribute__((target("default")))
-const unsigned char *
-decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out);
-__attribute__((target("default")))
-const unsigned char *
-decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out);
-__attribute__((target("default")))
-const unsigned char *
-decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
-
-#ifdef COULD_HAVE_SSE2
-__attribute__((target("sse2")))
-const unsigned char *
-decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out);
-__attribute__((target("sse2")))
-const unsigned char *
-decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out);
-__attribute__((target("sse2")))
-const unsigned char *
-decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
-#endif
-
-template<class Docid>
-Docid read_le(const void *in)
-{
-       Docid val;
-       memcpy(&val, in, sizeof(val));
-       if constexpr (sizeof(Docid) == 8) {
-               return le64toh(val);
-       } else if constexpr (sizeof(Docid) == 4) {
-               return le32toh(val);
-       } else if constexpr (sizeof(Docid) == 2) {
-               return le16toh(val);
-       } else if constexpr (sizeof(Docid) == 1) {
-               return val;
-       } else {
-               assert(false);
-       }
-}
-
-// Reads a single value with an encoding that looks a bit like PrefixVarint.
-// It's unclear why this doesn't use the varbyte encoding.
-template<class Docid>
-const unsigned char *read_baseval(const unsigned char *in, Docid *out)
-{
-       //fprintf(stderr, "baseval: 0x%02x 0x%02x 0x%02x 0x%02x\n", in[0], in[1], in[2], in[3]);
-       if (*in < 128) {
-               *out = *in;
-               return in + 1;
-       } else if (*in < 192) {
-               *out = ((uint32_t(in[0]) << 8) | uint32_t(in[1])) & 0x3fff;
-               return in + 2;
-       } else if (*in < 224) {
-               *out = ((uint32_t(in[0]) << 16) |
-                       (uint32_t(in[2]) << 8) |
-                       (uint32_t(in[1]))) & 0x1fffff;
-               return in + 3;
-       } else {
-               assert(false);  // Not implemented.
-       }
-}
-
-// Does not read past the end of the input.
-template<class Docid>
-const unsigned char *read_vb(const unsigned char *in, Docid *out)
-{
-       if (*in <= 176) {
-               *out = *in;
-               return in + 1;
-       } else if (*in <= 240) {
-               *out = ((uint32_t(in[0] - 177) << 8) | uint32_t(in[1])) + 177;
-               return in + 2;
-       } else if (*in <= 248) {
-               *out = ((uint32_t(in[0] - 241) << 16) | read_le<uint16_t>(in + 1)) + 16561;
-               return in + 3;
-       } else if (*in == 249) {
-               *out = (uint32_t(in[1])) |
-                       (uint32_t(in[2]) << 8) |
-                       (uint32_t(in[3]) << 16);
-               return in + 4;
-       } else if (*in == 250) {
-               *out = read_le<uint32_t>(in + 1);
-               return in + 5;
-       } else {
-               assert(false);
-       }
-}
-
-struct BitReader {
-public:
-       BitReader(const unsigned char *in, unsigned bits)
-               : in(in), bits(bits), mask(mask_for_bits(bits)) {}
-
-       // Can read 4 bytes past the end of the input (if bits_used == 0).
-       uint32_t read()
-       {
-               uint32_t val = (read_le<uint32_t>(in) >> bits_used) & mask;
-
-               bits_used += bits;
-               in += bits_used / 8;
-               bits_used %= 8;
-
-               return val;
-       }
-
-private:
-       const unsigned char *in;
-       const unsigned bits;
-       const unsigned mask;
-       unsigned bits_used = 0;
-};
-
-template<unsigned NumStreams>
-struct InterleavedBitReader {
-public:
-       InterleavedBitReader(const unsigned char *in, unsigned bits)
-               : in(in), bits(bits), mask(mask_for_bits(bits)) {}
-
-       // Can read 4 bytes past the end of the input (if bit_width == 0).
-       uint32_t read()
-       {
-               uint32_t val;
-               if (bits_used + bits > 32) {
-                       val = (read_le<uint32_t>(in) >> bits_used) | (read_le<uint32_t>(in + Stride) << (32 - bits_used));
-               } else {
-                       val = (read_le<uint32_t>(in) >> bits_used);
-               }
-
-               bits_used += bits;
-               in += Stride * (bits_used / 32);
-               bits_used %= 32;
-
-               return val & mask;
-       }
-
-private:
-       static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
-       const unsigned char *in;
-       const unsigned bits;
-       const unsigned mask;
-       unsigned bits_used = 0;
-};
-
-#ifdef COULD_HAVE_SSE2
-struct InterleavedBitReaderSSE2 {
-public:
-       __attribute__((target("sse2")))
-       InterleavedBitReaderSSE2(const unsigned char *in, unsigned bits)
-               : in(reinterpret_cast<const __m128i *>(in)), bits(bits), mask(_mm_set1_epi32(mask_for_bits(bits))) {}
-
-       // Can read 16 bytes past the end of the input (if bit_width == 0).
-       __attribute__((target("sse2")))
-       __m128i
-       read()
-       {
-               __m128i val = _mm_srli_epi32(_mm_loadu_si128(in), bits_used);
-               if (bits_used + bits > 32) {
-                       __m128i val_upper = _mm_slli_epi32(_mm_loadu_si128(in + 1), 32 - bits_used);
-                       val = _mm_or_si128(val, val_upper);
-               }
-               val = _mm_and_si128(val, mask);
-
-               bits_used += bits;
-               in += bits_used / 32;
-               bits_used %= 32;
-               return val;
-       }
-
-private:
-       const __m128i *in;
-       const unsigned bits;
-       const __m128i mask;
-       unsigned bits_used = 0;
-};
-#endif
-
-// Constant block. Layout:
-//
-//  - Bit width (6 bits) | type << 6
-//  - Base values (<bits> bits, rounded up to nearest byte)
 //
-// Can read 4 bytes past the end of the input (if bit_width == 0).
-template<class Docid>
-const unsigned char *decode_constant(const unsigned char *in, unsigned num, Docid *out)
-{
-       const unsigned bit_width = *in++ & 0x3f;
-       Docid val = read_le<Docid>(in);
-       if (bit_width < sizeof(Docid) * 8) {
-               val &= mask_for_bits(bit_width);
-       }
-
-       Docid prev_val = out[-1];
-       for (unsigned i = 0; i < num; ++i) {
-               out[i] = prev_val = val + prev_val + 1;
-       }
-       return in + div_round_up(bit_width, 8);
-}
-
-// FOR block (ie., PFor without exceptions). Layout:
-//
-//  - Bit width (6 bits) | type << 6
-//  - Base values (<num> values of <bits> bits, rounded up to a multiple of 32 values)
-//
-// Can read 4 bytes past the end of the input (inherit from BitReader).
-template<class Docid>
-const unsigned char *decode_for(const unsigned char *in, unsigned num, Docid *out)
-{
-       const unsigned bit_width = *in++ & 0x3f;
-
-       Docid prev_val = out[-1];
-       BitReader bs(in, bit_width);
-       for (unsigned i = 0; i < num; ++i) {
-               prev_val = out[i] = bs.read() + prev_val + 1;
-       }
-       return in + bytes_for_packed_bits(num, bit_width);
-}
-
-#ifdef COULD_HAVE_SSE2
-class DeltaDecoderSSE2 {
-public:
-       __attribute__((target("sse2")))
-       DeltaDecoderSSE2(uint32_t prev_val)
-               : prev_val(_mm_set1_epi32(prev_val)) {}
-
-       __attribute__((target("sse2")))
-       __m128i
-       decode(__m128i val)
-       {
-               val = _mm_add_epi32(val, _mm_slli_si128(val, 4));
-               val = _mm_add_epi32(val, _mm_slli_si128(val, 8));
-               val = _mm_add_epi32(val, _mm_add_epi32(prev_val, delta));
-               prev_val = _mm_shuffle_epi32(val, _MM_SHUFFLE(3, 3, 3, 3));
-               return val;
-       }
-
-private:
-       // Use 4/3/2/1 as delta instead of fixed 1, so that we can do the prev_val + delta
-       // in parallel with something else.
-       const __m128i delta = _mm_set_epi32(4, 3, 2, 1);
-
-       __m128i prev_val;
-};
-
-template<unsigned BlockSize>
-__attribute__((target("sse2"))) inline void delta_decode_sse2(uint32_t *out)
-{
-       DeltaDecoderSSE2 delta(out[-1]);
-       __m128i *outvec = reinterpret_cast<__m128i *>(out);
-       for (unsigned i = 0; i < BlockSize / 4; ++i) {
-               __m128i val = _mm_loadu_si128(outvec + i);
-               _mm_storeu_si128(outvec + i, delta.decode(val));
-       }
-}
-
-// Can read 16 bytes past the end of its input (inherit from InterleavedBitReaderSSE2).
-template<unsigned BlockSize, bool OrWithExisting, bool DeltaDecode, unsigned bit_width>
-__attribute__((target("sse2")))
-const unsigned char *
-decode_bitmap_sse2_unrolled(const unsigned char *in, uint32_t *out)
-{
-       __m128i *outvec = reinterpret_cast<__m128i *>(out);
-       DeltaDecoderSSE2 delta(out[-1]);
-       InterleavedBitReaderSSE2 bs(in, bit_width);
-#pragma GCC unroll 32
-       for (unsigned i = 0; i < BlockSize / 4; ++i) {
-               __m128i val = bs.read();
-               if constexpr (OrWithExisting) {
-                       val = _mm_or_si128(val, _mm_slli_epi32(_mm_loadu_si128(outvec + i), bit_width));
-               }
-               if constexpr (DeltaDecode) {
-                       val = delta.decode(val);
-               }
-               _mm_storeu_si128(outvec + i, val);
-       }
-       in += bytes_for_packed_bits(BlockSize, bit_width);
-       return in;
-}
-
-// Can read 16 bytes past the end of its input (inherit from InterleavedBitReaderSSE2).
-template<unsigned BlockSize, bool OrWithExisting, bool DeltaDecode>
-__attribute__((target("sse2")))
-const unsigned char *
-decode_bitmap_sse2(const unsigned char *in, unsigned bit_width, uint32_t *out)
-{
-       switch (bit_width) {
-       case 0:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 0>(in, out);
-       case 1:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 1>(in, out);
-       case 2:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 2>(in, out);
-       case 3:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 3>(in, out);
-       case 4:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 4>(in, out);
-       case 5:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 5>(in, out);
-       case 6:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 6>(in, out);
-       case 7:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 7>(in, out);
-       case 8:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 8>(in, out);
-       case 9:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 9>(in, out);
-       case 10:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 10>(in, out);
-       case 11:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 11>(in, out);
-       case 12:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 12>(in, out);
-       case 13:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 13>(in, out);
-       case 14:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 14>(in, out);
-       case 15:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 15>(in, out);
-       case 16:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 16>(in, out);
-       case 17:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 17>(in, out);
-       case 18:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 18>(in, out);
-       case 19:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 19>(in, out);
-       case 20:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 20>(in, out);
-       case 21:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 21>(in, out);
-       case 22:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 22>(in, out);
-       case 23:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 23>(in, out);
-       case 24:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 24>(in, out);
-       case 25:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 25>(in, out);
-       case 26:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 26>(in, out);
-       case 27:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 27>(in, out);
-       case 28:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 28>(in, out);
-       case 29:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 29>(in, out);
-       case 30:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 30>(in, out);
-       case 31:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 31>(in, out);
-       case 32:
-               return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 32>(in, out);
-       }
-       assert(false);
-}
-#endif
-
-// Like decode_for(), but the values are organized in four independent streams,
-// for SIMD (presumably SSE2). Supports a whole block only.
-//
-// Can read 16 bytes past the end of its input (inherit from InterleavedBitReader).
-template<unsigned BlockSize, class Docid>
-const unsigned char *decode_for_interleaved_generic(const unsigned char *in, Docid *out)
-{
-       const unsigned bit_width = *in++ & 0x3f;
-
-       InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
-       InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
-       InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
-       InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
-       for (unsigned i = 0; i < BlockSize / 4; ++i) {
-               out[i * 4 + 0] = bs0.read();
-               out[i * 4 + 1] = bs1.read();
-               out[i * 4 + 2] = bs2.read();
-               out[i * 4 + 3] = bs3.read();
-       }
-       Docid prev_val = out[-1];
-       for (unsigned i = 0; i < BlockSize; ++i) {
-               out[i] = prev_val = out[i] + prev_val + 1;
-       }
-       return in + bytes_for_packed_bits(BlockSize, bit_width);
-}
-
-// Does not read past the end of the input.
-template<unsigned BlockSize, class Docid>
-const unsigned char *decode_for_interleaved(const unsigned char *in, Docid *out)
-{
-       if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
-               return decode_for_interleaved_128_32(in, out);
-       } else {
-               return decode_for_interleaved_generic(in, out);
-       }
-}
-
-// Does not read past the end of the input.
-__attribute__((target("default")))
-const unsigned char *
-decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out)
-{
-       return decode_for_interleaved_generic<128>(in, out);
-}
-
-#ifdef COULD_HAVE_SSE2
-// Specialized version for SSE2.
-// Can read 16 bytes past the end of the input (inherit from decode_bitmap_sse2()).
-__attribute__((target("sse2")))
-const unsigned char *
-decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out)
-{
-       constexpr unsigned BlockSize = 128;
-
-       const unsigned bit_width = *in++ & 0x3f;
-
-       in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/false, /*DeltaDecode=*/true>(in, bit_width, out);
-
-       return in;
-}
-#endif
-
-// Can read 4 bytes past the end of the input (inherit from BitReader).
-template<class Docid>
-const unsigned char *decode_pfor_bitmap_exceptions(const unsigned char *in, unsigned num, Docid *out)
-{
-       const unsigned exception_bit_width = *in++;
-       const uint64_t *exception_bitmap_ptr = reinterpret_cast<const uint64_t *>(in);
-       in += div_round_up(num, 8);
-
-       int num_exceptions = 0;
-
-       BitReader bs(in, exception_bit_width);
-       for (unsigned i = 0; i < num; i += 64, ++exception_bitmap_ptr) {
-               uint64_t exceptions = read_le<uint64_t>(exception_bitmap_ptr);
-               if (num - i < 64) {
-                       // We've read some bytes past the end, so clear out the junk bits.
-                       exceptions &= (1ULL << (num - i)) - 1;
-               }
-               for (; exceptions != 0; exceptions &= exceptions - 1, ++num_exceptions) {
-                       unsigned idx = (ffsll(exceptions) - 1) + i;
-                       out[idx] = bs.read();
-               }
-       }
-       in += bytes_for_packed_bits(num_exceptions, exception_bit_width);
-       return in;
-}
-
-// PFor block with bitmap exceptions. Layout:
-//
-//  - Bit width (6 bits) | type << 6
-//  - Exception bit width (8 bits)
-//  - Bitmap of which values have exceptions (<num> bits, rounded up to a byte)
-//  - Exceptions (<num_exc> values of <bits_exc> bits, rounded up to a byte)
-//  - Base values (<num> values of <bits> bits, rounded up to a byte)
-//
-// Can read 4 bytes past the end of the input (inherit from BitReader).
-template<class Docid>
-const unsigned char *decode_pfor_bitmap(const unsigned char *in, unsigned num, Docid *out)
-{
-       memset(out, 0, num * sizeof(Docid));
-
-       const unsigned bit_width = *in++ & 0x3f;
-
-       in = decode_pfor_bitmap_exceptions(in, num, out);
-
-       // Decode the base values, and delta-decode.
-       Docid prev_val = out[-1];
-       BitReader bs(in, bit_width);
-       for (unsigned i = 0; i < num; ++i) {
-               out[i] = prev_val = ((out[i] << bit_width) | bs.read()) + prev_val + 1;
-       }
-       return in + bytes_for_packed_bits(num, bit_width);
-}
-
-// Like decode_pfor_bitmap(), but the base values are organized in four
-// independent streams, for SIMD (presumably SSE2). Supports a whole block only.
-//
-// Can read 16 bytes past the end of the input (inherit from InterleavedBitReader
-// and decode_pfor_bitmap_exceptions()).
-template<unsigned BlockSize, class Docid>
-const unsigned char *decode_pfor_bitmap_interleaved_generic(const unsigned char *in, Docid *out)
-{
-       memset(out, 0, BlockSize * sizeof(Docid));
-
-       const unsigned bit_width = *in++ & 0x3f;
-
-       in = decode_pfor_bitmap_exceptions(in, BlockSize, out);
-
-       // Decode the base values.
-       InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
-       InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
-       InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
-       InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
-       for (unsigned i = 0; i < BlockSize / 4; ++i) {
-               out[i * 4 + 0] = bs0.read() | (out[i * 4 + 0] << bit_width);
-               out[i * 4 + 1] = bs1.read() | (out[i * 4 + 1] << bit_width);
-               out[i * 4 + 2] = bs2.read() | (out[i * 4 + 2] << bit_width);
-               out[i * 4 + 3] = bs3.read() | (out[i * 4 + 3] << bit_width);
-       }
-
-       // Delta-decode.
-       Docid prev_val = out[-1];
-       for (unsigned i = 0; i < BlockSize; ++i) {
-               out[i] = prev_val = out[i] + prev_val + 1;
-       }
-       return in + bytes_for_packed_bits(BlockSize, bit_width);
-}
-
-// Can read 16 bytes past the end of the input (inherit from decode_pfor_bitmap_interleaved_generic()).
-template<unsigned BlockSize, class Docid>
-const unsigned char *decode_pfor_bitmap_interleaved(const unsigned char *in, Docid *out)
-{
-       if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
-               return decode_pfor_bitmap_interleaved_128_32(in, out);
-       } else {
-               return decode_pfor_bitmap_interleaved_generic(in, out);
-       }
-}
-
-__attribute__((target("default")))
-const unsigned char *
-decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out)
-{
-       return decode_pfor_bitmap_interleaved_generic<128>(in, out);
-}
-
-#ifdef COULD_HAVE_SSE2
-// Specialized version for SSE2.
-//
-// Can read 16 bytes past the end of the input (inherit from InterleavedBitReaderSSE2
-// and decode_pfor_bitmap_exceptions()).
-__attribute__((target("sse2")))
-const unsigned char *
-decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out)
-{
-       constexpr unsigned BlockSize = 128;
-
-// Set all output values to zero, before the exceptions are filled in.
-#pragma GCC unroll 4
-       for (unsigned i = 0; i < BlockSize / 4; ++i) {
-               _mm_storeu_si128(reinterpret_cast<__m128i *>(out) + i, _mm_setzero_si128());
-       }
-
-       const unsigned bit_width = *in++ & 0x3f;
-
-       in = decode_pfor_bitmap_exceptions(in, BlockSize, out);
-       in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/true, /*DeltaDecode=*/true>(in, bit_width, out);
-
-       return in;
-}
-#endif
-
-// PFor block with variable-byte exceptions. Layout:
-//
-//  - Bit width (6 bits) | type << 6
-//  - Number of exceptions (8 bits)
-//  - Base values (<num> values of <bits> bits, rounded up to a byte)
-//  - Exceptions:
-//    - If first byte is 255, <num_exc> 32-bit values (does not include the 255 byte)
-//    - Else, <num_exc> varbyte-encoded values (includes the non-255 byte)
-//  - Indexes of exceptions (<num_exc> bytes).
-//
-// Can read 4 bytes past the end of the input (inherit from BitReader,
-// assuming zero exceptions).
-template<unsigned BlockSize, class Docid>
-const unsigned char *decode_pfor_vb(const unsigned char *in, unsigned num, Docid *out)
-{
-       //fprintf(stderr, "in=%p out=%p num=%u\n", in, out, num);
-
-       const unsigned bit_width = *in++ & 0x3f;
-       unsigned num_exceptions = *in++;
-
-       // Decode the base values.
-       BitReader bs(in, bit_width);
-       for (unsigned i = 0; i < num; ++i) {
-               out[i] = bs.read();
-       }
-       in += bytes_for_packed_bits(num, bit_width);
-
-       // Decode exceptions.
-       Docid exceptions[BlockSize];
-       if (*in == 255) {
-               ++in;
-               for (unsigned i = 0; i < num_exceptions; ++i) {
-                       exceptions[i] = read_le<Docid>(in);
-                       in += sizeof(Docid);
-               }
-       } else {
-               for (unsigned i = 0; i < num_exceptions; ++i) {
-                       in = read_vb(in, &exceptions[i]);
-               }
-       }
-       // Apply exceptions.
-       for (unsigned i = 0; i < num_exceptions; ++i) {
-               unsigned idx = *in++;
-               out[idx] |= exceptions[i] << bit_width;
-       }
-
-       // Delta-decode.
-       Docid prev_val = out[-1];
-       for (unsigned i = 0; i < num; ++i) {
-               out[i] = prev_val = out[i] + prev_val + 1;
-       }
-
-       return in;
-}
-
-// Like decode_pfor_vb(), but the base values are organized in four
-// independent streams, for SIMD (presumably SSE2). Supports a whole block only.
-// Can read 16 bytes past the end of its input (inherit from InterleavedBitReader).
-template<unsigned BlockSize, class Docid>
-const unsigned char *decode_pfor_vb_interleaved_generic(const unsigned char *in, Docid *out)
-{
-       const unsigned bit_width = *in++ & 0x3f;
-       unsigned num_exceptions = *in++;
-
-       // Decode the base values.
-       InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
-       InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
-       InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
-       InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
-       for (unsigned i = 0; i < BlockSize / 4; ++i) {
-               out[i * 4 + 0] = bs0.read();
-               out[i * 4 + 1] = bs1.read();
-               out[i * 4 + 2] = bs2.read();
-               out[i * 4 + 3] = bs3.read();
-       }
-       in += bytes_for_packed_bits(BlockSize, bit_width);
-
-       // Decode exceptions.
-       Docid exceptions[BlockSize];
-       if (*in == 255) {
-               ++in;
-               for (unsigned i = 0; i < num_exceptions; ++i) {
-                       exceptions[i] = read_le<Docid>(in);
-                       in += sizeof(Docid);
-               }
-       } else {
-               for (unsigned i = 0; i < num_exceptions; ++i) {
-                       in = read_vb(in, &exceptions[i]);
-               }
-       }
-
-       // Apply exceptions.
-       for (unsigned i = 0; i < num_exceptions; ++i) {
-               unsigned idx = *in++;
-               out[idx] |= exceptions[i] << bit_width;
-       }
-
-       // Delta-decode.
-       Docid prev_val = out[-1];
-       for (unsigned i = 0; i < BlockSize; ++i) {
-               out[i] = prev_val = out[i] + prev_val + 1;
-       }
-
-       return in;
-}
-
-// Can read 16 bytes past the end of its input (inherit from decode_pfor_vb_interleaved_generic()).
-template<unsigned BlockSize, class Docid>
-const unsigned char *decode_pfor_vb_interleaved(const unsigned char *in, Docid *out)
-{
-       if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
-               return decode_pfor_vb_interleaved_128_32(in, out);
-       } else {
-               return decode_pfor_vb_interleaved_generic(in, out);
-       }
-}
-
-__attribute__((target("default")))
-const unsigned char *
-decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
-{
-       return decode_pfor_vb_interleaved_generic<128>(in, out);
-}
-
-// Specialized version for SSE2.
-// Can read 16 bytes past the end of the input (inherit from decode_bitmap_sse2()).
-__attribute__((target("sse2")))
-const unsigned char *
-decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
-{
-       constexpr unsigned BlockSize = 128;
-       using Docid = uint32_t;
-
-       const unsigned bit_width = *in++ & 0x3f;
-       unsigned num_exceptions = *in++;
-
-       // Decode the base values.
-       in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/false, /*DeltaDecode=*/false>(in, bit_width, out);
-
-       // Decode exceptions.
-       Docid exceptions[BlockSize];
-       if (*in == 255) {
-               ++in;
-               for (unsigned i = 0; i < num_exceptions; ++i) {
-                       exceptions[i] = read_le<Docid>(in);
-                       in += sizeof(Docid);
-               }
-       } else {
-               for (unsigned i = 0; i < num_exceptions; ++i) {
-                       in = read_vb(in, &exceptions[i]);
-               }
-       }
-
-       // Apply exceptions.
-       for (unsigned i = 0; i < num_exceptions; ++i) {
-               unsigned idx = *in++;
-               out[idx] |= exceptions[i] << bit_width;
-       }
-
-       delta_decode_sse2<BlockSize>(out);
-
-       return in;
-}
-
-// Can read 16 bytes past the end of the input (inherit from several functions).
-template<unsigned BlockSize, class Docid>
-const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, bool interleaved, Docid *out)
-{
-       if (num == 0) {
-               return in;
-       }
-       in = read_baseval(in, out++);
-
-       for (unsigned i = 1; i < num; i += BlockSize, out += BlockSize) {
-               const unsigned num_this_block = std::min<unsigned>(num - i, BlockSize);
-               switch (in[0] >> 6) {
-               case BlockType::FOR:
-                       if (interleaved && num_this_block == BlockSize) {
-                               dprintf("%d+%d: blocktype=%d (for, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                               in = decode_for_interleaved<BlockSize>(in, out);
-                       } else {
-                               dprintf("%d+%d: blocktype=%d (for), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                               in = decode_for(in, num_this_block, out);
-                       }
-                       break;
-               case BlockType::PFOR_VB:
-                       if (interleaved && num_this_block == BlockSize) {
-                               dprintf("%d+%d: blocktype=%d (pfor + vb, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                               in = decode_pfor_vb_interleaved<BlockSize>(in, out);
-                       } else {
-                               dprintf("%d+%d: blocktype=%d (pfor + vb), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                               in = decode_pfor_vb<BlockSize>(in, num_this_block, out);
-                       }
-                       break;
-               case BlockType::PFOR_BITMAP:
-                       if (interleaved && num_this_block == BlockSize) {
-                               dprintf("%d+%d: blocktype=%d (pfor + bitmap, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                               in = decode_pfor_bitmap_interleaved<BlockSize>(in, out);
-                       } else {
-                               dprintf("%d+%d: blocktype=%d (pfor + bitmap), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                               in = decode_pfor_bitmap(in, num_this_block, out);
-                       }
-                       break;
-               case BlockType::CONSTANT:
-                       dprintf("%d+%d: blocktype=%d (constant), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
-                       in = decode_constant(in, num_this_block, out);
-                       break;
-               }
-       }
+// Although all of these algorithms are templatized, we expose only
+// the single specialization that we need, in order to increase the
+// speed of incremental compilation.
 
-       return in;
-}
+const unsigned char *decode_pfor_delta1_128(const unsigned char *in, unsigned num, bool interleaved, uint32_t *out);
 
 #endif  // !defined(_TURBOPFOR_H)