+// Variant of decode_pfor_bitmap() for blocks whose base values are laid out
+// as four independent streams, for SIMD (presumably SSE2). Only a whole block
+// of BlockSize values can be decoded.
+//
+// Consumes, in order: two header bytes, the exception bitmap, the packed
+// exception values and the packed base values. Writes BlockSize delta-decoded
+// values to "out" and returns "in" advanced past everything that was consumed.
+// Note that out[-1] is read as the starting value for the delta decoding.
+template<unsigned BlockSize, class Docid>
+const unsigned char *decode_pfor_bitmap_interleaved(const unsigned char *in, Docid *out)
+{
+	// Every slot starts at zero; exceptions are stored first and the base
+	// values are OR-ed in afterwards.
+	memset(out, 0, BlockSize * sizeof(Docid));
+
+	// Header: the low six bits of the first byte are the base bit width,
+	// the second byte is the bit width of the exception values.
+	const unsigned base_bits = in[0] & 0x3f;
+	const unsigned exception_bits = in[1];
+	in += 2;
+
+	// Decode exceptions.
+	{
+		// The bitmap comes first and is walked one 64-bit word at a time;
+		// the packed exception values start right after it.
+		const uint64_t *bitmap_word = reinterpret_cast<const uint64_t *>(in);
+		in += div_round_up(BlockSize, 8);
+
+		BitReader exception_reader(in, exception_bits);
+		int exception_count = 0;
+
+		for (unsigned word_start = 0; word_start < BlockSize; word_start += 64) {
+			uint64_t pending = read_le<uint64_t>(bitmap_word);
+			++bitmap_word;
+
+			// Visit the set bits from lowest to highest; each one marks
+			// an output slot that has an exception value.
+			while (pending != 0) {
+				unsigned idx = (ffsll(pending) - 1) + word_start;
+				// Shift the exception above the bits the base value will fill.
+				out[idx] = exception_reader.read() << base_bits;
+				pending &= pending - 1;  // Clear the lowest set bit.
+				++exception_count;
+			}
+		}
+
+		// Skip the packed exception values that were just consumed.
+		in += bytes_for_packed_bits(exception_count, exception_bits);
+	}
+
+	// Decode the base values: one reader per stream, each starting one
+	// uint32_t further into the input than the previous one.
+	InterleavedBitReader<4> lane0(in, base_bits);
+	InterleavedBitReader<4> lane1(in + sizeof(uint32_t), base_bits);
+	InterleavedBitReader<4> lane2(in + 2 * sizeof(uint32_t), base_bits);
+	InterleavedBitReader<4> lane3(in + 3 * sizeof(uint32_t), base_bits);
+
+	Docid *group = out;
+	for (unsigned g = 0; g < BlockSize / 4; ++g, group += 4) {
+		group[0] |= lane0.read();
+		group[1] |= lane1.read();
+		group[2] |= lane2.read();
+		group[3] |= lane3.read();
+	}
+
+	// Delta-decode: each value becomes (previous value + stored delta + 1).
+	// The first element uses out[-1] as its predecessor.
+	const Docid *prev = out - 1;
+	for (Docid *cur = out; cur != out + BlockSize; ++cur, ++prev) {
+		*cur += *prev + 1;
+	}
+
+	return in + bytes_for_packed_bits(BlockSize, base_bits);
+}
+