4 // A reimplementation of parts of the TurboPFor codecs, using the same
5 // storage format. These are not as fast as the reference implementation
6 // (about 80% of the performance, averaged over a real plocate corpus),
7 // and do not support the same breadth of codecs (in particular, only
8 // delta-plus-1 is implemented, and only 32-bit docids are tested),
9 // but aim to be more portable and (ideally) easier-to-understand.
10 // In particular, they will compile on x86 without SSE4.1 or AVX support.
12 // The main reference is https://michael.stapelberg.ch/posts/2019-02-05-turbopfor-analysis/,
13 // although some implementation details have been worked out by studying the
23 #if defined(__i386__) || defined(__x86_64__)
24 #define COULD_HAVE_SSE2
25 #include <immintrin.h>
28 // Forward declarations to declare to the template code below that they exist.
29 // (These must seemingly be non-templates for function multiversioning to work.)
30 __attribute__((target("default")))
32 decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out);
33 __attribute__((target("default")))
35 decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out);
36 __attribute__((target("default")))
38 decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
40 #ifdef COULD_HAVE_SSE2
41 __attribute__((target("sse2")))
43 decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out);
44 __attribute__((target("sse2")))
46 decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out);
47 __attribute__((target("sse2")))
49 decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
52 constexpr uint32_t mask_for_bits(unsigned bit_width)
54 if (bit_width == 32) {
57 return (1U << bit_width) - 1;
62 Docid read_le(const void *in)
65 memcpy(&val, in, sizeof(val));
66 if constexpr (sizeof(Docid) == 8) {
68 } else if constexpr (sizeof(Docid) == 4) {
70 } else if constexpr (sizeof(Docid) == 2) {
72 } else if constexpr (sizeof(Docid) == 1) {
79 // Reads a single value with an encoding that looks a bit like PrefixVarint.
80 // It's unclear why this doesn't use the varbyte encoding.
82 const unsigned char *read_baseval(const unsigned char *in, Docid *out)
84 //fprintf(stderr, "baseval: 0x%02x 0x%02x 0x%02x 0x%02x\n", in[0], in[1], in[2], in[3]);
88 } else if (*in < 192) {
89 *out = ((uint32_t(in[0]) << 8) | uint32_t(in[1])) & 0x3fff;
91 } else if (*in < 224) {
92 *out = ((uint32_t(in[0]) << 16) |
93 (uint32_t(in[2]) << 8) |
94 (uint32_t(in[1]))) & 0x1fffff;
97 assert(false); // Not implemented.
101 template<class Docid>
102 const unsigned char *read_vb(const unsigned char *in, Docid *out)
107 } else if (*in <= 240) {
108 *out = ((uint32_t(in[0] - 177) << 8) | uint32_t(in[1])) + 177;
110 } else if (*in <= 248) {
111 *out = ((uint32_t(in[0] - 241) << 16) | read_le<uint16_t>(in + 1)) + 16561;
113 } else if (*in == 249) {
114 *out = (uint32_t(in[1])) |
115 (uint32_t(in[2]) << 8) |
116 (uint32_t(in[3]) << 16);
118 } else if (*in == 250) {
119 *out = read_le<uint32_t>(in + 1);
128 BitReader(const unsigned char *in, unsigned bits)
129 : in(in), bits(bits), mask(mask_for_bits(bits)) {}
132 uint32_t val = (read_le<uint32_t>(in) >> bits_used) & mask;
142 const unsigned char *in;
145 unsigned bits_used = 0;
148 template<unsigned NumStreams>
149 struct InterleavedBitReader {
151 InterleavedBitReader(const unsigned char *in, unsigned bits)
152 : in(in), bits(bits), mask(mask_for_bits(bits)) {}
156 if (bits_used + bits > 32) {
157 val = (read_le<uint32_t>(in) >> bits_used) | (read_le<uint32_t>(in + Stride) << (32 - bits_used));
159 val = (read_le<uint32_t>(in) >> bits_used);
163 in += Stride * (bits_used / 32);
170 static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
171 const unsigned char *in;
174 unsigned bits_used = 0;
177 #ifdef COULD_HAVE_SSE2
178 struct InterleavedBitReaderSSE2 {
180 __attribute__((target("sse2")))
181 InterleavedBitReaderSSE2(const unsigned char *in, unsigned bits)
182 : in(reinterpret_cast<const __m128i *>(in)), bits(bits), mask(_mm_set1_epi32(mask_for_bits(bits))) {}
184 __attribute__((target("sse2")))
188 __m128i val = _mm_srli_epi32(_mm_loadu_si128(in), bits_used);
189 if (bits_used + bits > 32) {
190 __m128i val_upper = _mm_slli_epi32(_mm_loadu_si128(in + 1), 32 - bits_used);
191 val = _mm_or_si128(val, val_upper);
193 val = _mm_and_si128(val, mask);
196 in += bits_used / 32;
205 unsigned bits_used = 0;
209 // Does not properly account for overflow.
210 inline unsigned div_round_up(unsigned val, unsigned div)
212 return (val + div - 1) / div;
215 inline unsigned bytes_for_packed_bits(unsigned num, unsigned bit_width)
217 return div_round_up(num * bit_width, CHAR_BIT);
220 // Constant block. Layout:
222 // - Bit width (6 bits) | type << 6
223 // - Base values (<bits> bits, rounded up to nearest byte)
224 template<class Docid>
225 const unsigned char *decode_constant(const unsigned char *in, unsigned num, Docid *out)
227 const unsigned bit_width = *in++ & 0x3f;
228 Docid val = read_le<Docid>(in);
229 if (bit_width < sizeof(Docid) * 8) {
230 val &= mask_for_bits(bit_width);
233 Docid prev_val = out[-1];
234 for (unsigned i = 0; i < num; ++i) {
235 out[i] = prev_val = val + prev_val + 1;
237 return in + div_round_up(bit_width, 8);
240 // FOR block (ie., PFor without exceptions). Layout:
242 // - Bit width (6 bits) | type << 6
243 // - Base values (<num> values of <bits> bits, rounded up to a multiple of 32 values)
244 template<class Docid>
245 const unsigned char *decode_for(const unsigned char *in, unsigned num, Docid *out)
247 const unsigned bit_width = *in++ & 0x3f;
249 Docid prev_val = out[-1];
250 BitReader bs(in, bit_width);
251 for (unsigned i = 0; i < num; ++i) {
252 prev_val = out[i] = bs.read() + prev_val + 1;
254 return in + bytes_for_packed_bits(num, bit_width);
257 #ifdef COULD_HAVE_SSE2
258 class DeltaDecoderSSE2 {
260 __attribute__((target("sse2")))
261 DeltaDecoderSSE2(uint32_t prev_val)
262 : prev_val(_mm_set1_epi32(prev_val)) {}
264 __attribute__((target("sse2")))
268 val = _mm_add_epi32(val, _mm_slli_si128(val, 4));
269 val = _mm_add_epi32(val, _mm_slli_si128(val, 8));
270 val = _mm_add_epi32(val, _mm_add_epi32(prev_val, delta));
271 prev_val = _mm_shuffle_epi32(val, _MM_SHUFFLE(3, 3, 3, 3));
276 // Use 4/3/2/1 as delta instead of fixed 1, so that we can do the prev_val + delta
277 // in parallel with something else.
278 const __m128i delta = _mm_set_epi32(4, 3, 2, 1);
283 template<unsigned BlockSize>
284 __attribute__((target("sse2"))) inline void delta_decode_sse2(uint32_t *out)
286 DeltaDecoderSSE2 delta(out[-1]);
287 __m128i *outvec = reinterpret_cast<__m128i *>(out);
288 for (unsigned i = 0; i < BlockSize / 4; ++i) {
289 __m128i val = _mm_loadu_si128(outvec + i);
290 _mm_storeu_si128(outvec + i, delta.decode(val));
294 template<unsigned BlockSize, bool OrWithExisting, bool DeltaDecode, unsigned bit_width>
295 __attribute__((target("sse2")))
296 const unsigned char *
297 decode_bitmap_sse2_unrolled(const unsigned char *in, uint32_t *out)
299 __m128i *outvec = reinterpret_cast<__m128i *>(out);
300 DeltaDecoderSSE2 delta(out[-1]);
301 InterleavedBitReaderSSE2 bs(in, bit_width);
302 #pragma GCC unroll 32
303 for (unsigned i = 0; i < BlockSize / 4; ++i) {
304 __m128i val = bs.read();
305 if constexpr (OrWithExisting) {
306 val = _mm_or_si128(val, _mm_slli_epi32(_mm_loadu_si128(outvec + i), bit_width));
308 if constexpr (DeltaDecode) {
309 val = delta.decode(val);
311 _mm_storeu_si128(outvec + i, val);
313 in += bytes_for_packed_bits(BlockSize, bit_width);
317 template<unsigned BlockSize, bool OrWithExisting, bool DeltaDecode>
318 __attribute__((target("sse2")))
319 const unsigned char *
320 decode_bitmap_sse2(const unsigned char *in, unsigned bit_width, uint32_t *out)
324 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 0>(in, out);
326 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 1>(in, out);
328 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 2>(in, out);
330 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 3>(in, out);
332 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 4>(in, out);
334 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 5>(in, out);
336 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 6>(in, out);
338 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 7>(in, out);
340 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 8>(in, out);
342 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 9>(in, out);
344 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 10>(in, out);
346 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 11>(in, out);
348 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 12>(in, out);
350 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 13>(in, out);
352 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 14>(in, out);
354 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 15>(in, out);
356 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 16>(in, out);
358 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 17>(in, out);
360 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 18>(in, out);
362 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 19>(in, out);
364 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 20>(in, out);
366 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 21>(in, out);
368 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 22>(in, out);
370 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 23>(in, out);
372 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 24>(in, out);
374 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 25>(in, out);
376 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 26>(in, out);
378 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 27>(in, out);
380 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 28>(in, out);
382 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 29>(in, out);
384 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 30>(in, out);
386 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 31>(in, out);
388 return decode_bitmap_sse2_unrolled<BlockSize, OrWithExisting, DeltaDecode, 32>(in, out);
394 // Like decode_for(), but the values are organized in four independent streams,
395 // for SIMD (presumably SSE2). Supports a whole block only.
396 template<unsigned BlockSize, class Docid>
397 const unsigned char *decode_for_interleaved_generic(const unsigned char *in, Docid *out)
399 const unsigned bit_width = *in++ & 0x3f;
401 InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
402 InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
403 InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
404 InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
405 for (unsigned i = 0; i < BlockSize / 4; ++i) {
406 out[i * 4 + 0] = bs0.read();
407 out[i * 4 + 1] = bs1.read();
408 out[i * 4 + 2] = bs2.read();
409 out[i * 4 + 3] = bs3.read();
411 Docid prev_val = out[-1];
412 for (unsigned i = 0; i < BlockSize; ++i) {
413 out[i] = prev_val = out[i] + prev_val + 1;
415 return in + bytes_for_packed_bits(BlockSize, bit_width);
418 template<unsigned BlockSize, class Docid>
419 const unsigned char *decode_for_interleaved(const unsigned char *in, Docid *out)
421 if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
422 return decode_for_interleaved_128_32(in, out);
424 return decode_for_interleaved_generic(in, out);
428 __attribute__((target("default")))
429 const unsigned char *
430 decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out)
432 return decode_for_interleaved_generic<128>(in, out);
435 #ifdef COULD_HAVE_SSE2
436 // Specialized version for SSE2.
437 __attribute__((target("sse2")))
438 const unsigned char *
439 decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out)
441 constexpr unsigned BlockSize = 128;
443 const unsigned bit_width = *in++ & 0x3f;
445 in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/false, /*DeltaDecode=*/true>(in, bit_width, out);
451 template<class Docid>
452 const unsigned char *decode_pfor_bitmap_exceptions(const unsigned char *in, unsigned num, Docid *out)
454 const unsigned exception_bit_width = *in++;
455 const uint64_t *exception_bitmap_ptr = reinterpret_cast<const uint64_t *>(in);
456 in += div_round_up(num, 8);
458 int num_exceptions = 0;
460 BitReader bs(in, exception_bit_width);
461 for (unsigned i = 0; i < num; i += 64, ++exception_bitmap_ptr) {
462 uint64_t exceptions = read_le<uint64_t>(exception_bitmap_ptr);
464 // We've read some bytes past the end, so clear out the junk bits.
465 exceptions &= (1ULL << (num - i)) - 1;
467 for (; exceptions != 0; exceptions &= exceptions - 1, ++num_exceptions) {
468 unsigned idx = (ffsll(exceptions) - 1) + i;
469 out[idx] = bs.read();
472 in += bytes_for_packed_bits(num_exceptions, exception_bit_width);
476 // PFor block with bitmap exceptions. Layout:
478 // - Bit width (6 bits) | type << 6
479 // - Exception bit width (8 bits)
480 // - Bitmap of which values have exceptions (<num> bits, rounded up to a byte)
481 // - Exceptions (<num_exc> values of <bits_exc> bits, rounded up to a byte)
482 // - Base values (<num> values of <bits> bits, rounded up to a byte)
483 template<class Docid>
484 const unsigned char *decode_pfor_bitmap(const unsigned char *in, unsigned num, Docid *out)
486 memset(out, 0, num * sizeof(Docid));
488 const unsigned bit_width = *in++ & 0x3f;
490 in = decode_pfor_bitmap_exceptions(in, num, out);
492 // Decode the base values, and delta-decode.
493 Docid prev_val = out[-1];
494 BitReader bs(in, bit_width);
495 for (unsigned i = 0; i < num; ++i) {
496 out[i] = prev_val = ((out[i] << bit_width) | bs.read()) + prev_val + 1;
498 return in + bytes_for_packed_bits(num, bit_width);
501 // Like decode_pfor_bitmap(), but the base values are organized in four
502 // independent streams, for SIMD (presumably SSE2). Supports a whole block only.
503 template<unsigned BlockSize, class Docid>
504 const unsigned char *decode_pfor_bitmap_interleaved_generic(const unsigned char *in, Docid *out)
506 memset(out, 0, BlockSize * sizeof(Docid));
508 const unsigned bit_width = *in++ & 0x3f;
510 in = decode_pfor_bitmap_exceptions(in, BlockSize, out);
512 // Decode the base values.
513 InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
514 InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
515 InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
516 InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
517 for (unsigned i = 0; i < BlockSize / 4; ++i) {
518 out[i * 4 + 0] = bs0.read() | (out[i * 4 + 0] << bit_width);
519 out[i * 4 + 1] = bs1.read() | (out[i * 4 + 1] << bit_width);
520 out[i * 4 + 2] = bs2.read() | (out[i * 4 + 2] << bit_width);
521 out[i * 4 + 3] = bs3.read() | (out[i * 4 + 3] << bit_width);
525 Docid prev_val = out[-1];
526 for (unsigned i = 0; i < BlockSize; ++i) {
527 out[i] = prev_val = out[i] + prev_val + 1;
529 return in + bytes_for_packed_bits(BlockSize, bit_width);
532 template<unsigned BlockSize, class Docid>
533 const unsigned char *decode_pfor_bitmap_interleaved(const unsigned char *in, Docid *out)
535 if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
536 return decode_pfor_bitmap_interleaved_128_32(in, out);
538 return decode_pfor_bitmap_interleaved_generic(in, out);
542 __attribute__((target("default")))
543 const unsigned char *
544 decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out)
546 return decode_pfor_bitmap_interleaved_generic<128>(in, out);
549 #ifdef COULD_HAVE_SSE2
550 // Specialized version for SSE2.
551 __attribute__((target("sse2")))
552 const unsigned char *
553 decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out)
555 constexpr unsigned BlockSize = 128;
557 // Set all output values to zero, before the exceptions are filled in.
559 for (unsigned i = 0; i < BlockSize / 4; ++i) {
560 _mm_storeu_si128(reinterpret_cast<__m128i *>(out) + i, _mm_setzero_si128());
563 const unsigned bit_width = *in++ & 0x3f;
565 in = decode_pfor_bitmap_exceptions(in, BlockSize, out);
566 in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/true, /*DeltaDecode=*/true>(in, bit_width, out);
572 // PFor block with variable-byte exceptions. Layout:
574 // - Bit width (6 bits) | type << 6
575 // - Number of exceptions (8 bits)
576 // - Base values (<num> values of <bits> bits, rounded up to a byte)
578 // - If first byte is 255, <num_exc> 32-bit values (does not include the 255 byte)
579 // - Else, <num_exc> varbyte-encoded values (includes the non-255 byte)
580 // - Indexes of exceptions (<num_exc> bytes).
581 template<unsigned BlockSize, class Docid>
582 const unsigned char *decode_pfor_vb(const unsigned char *in, unsigned num, Docid *out)
584 //fprintf(stderr, "in=%p out=%p num=%u\n", in, out, num);
586 const unsigned bit_width = *in++ & 0x3f;
587 unsigned num_exceptions = *in++;
589 // Decode the base values.
590 BitReader bs(in, bit_width);
591 for (unsigned i = 0; i < num; ++i) {
594 in += bytes_for_packed_bits(num, bit_width);
596 // Decode exceptions.
597 Docid exceptions[BlockSize];
600 for (unsigned i = 0; i < num_exceptions; ++i) {
601 exceptions[i] = read_le<Docid>(in);
605 for (unsigned i = 0; i < num_exceptions; ++i) {
606 in = read_vb(in, &exceptions[i]);
610 for (unsigned i = 0; i < num_exceptions; ++i) {
611 unsigned idx = *in++;
612 out[idx] |= exceptions[i] << bit_width;
616 Docid prev_val = out[-1];
617 for (unsigned i = 0; i < num; ++i) {
618 out[i] = prev_val = out[i] + prev_val + 1;
624 // Like decode_pfor_vb(), but the base values are organized in four
625 // independent streams, for SIMD (presumably SSE2). Supports a whole block only.
626 template<unsigned BlockSize, class Docid>
627 const unsigned char *decode_pfor_vb_interleaved_generic(const unsigned char *in, Docid *out)
629 const unsigned bit_width = *in++ & 0x3f;
630 unsigned num_exceptions = *in++;
632 // Decode the base values.
633 InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
634 InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
635 InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
636 InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
637 for (unsigned i = 0; i < BlockSize / 4; ++i) {
638 out[i * 4 + 0] = bs0.read();
639 out[i * 4 + 1] = bs1.read();
640 out[i * 4 + 2] = bs2.read();
641 out[i * 4 + 3] = bs3.read();
643 in += bytes_for_packed_bits(BlockSize, bit_width);
645 // Decode exceptions.
646 Docid exceptions[BlockSize];
649 for (unsigned i = 0; i < num_exceptions; ++i) {
650 exceptions[i] = read_le<Docid>(in);
654 for (unsigned i = 0; i < num_exceptions; ++i) {
655 in = read_vb(in, &exceptions[i]);
660 for (unsigned i = 0; i < num_exceptions; ++i) {
661 unsigned idx = *in++;
662 out[idx] |= exceptions[i] << bit_width;
666 Docid prev_val = out[-1];
667 for (unsigned i = 0; i < BlockSize; ++i) {
668 out[i] = prev_val = out[i] + prev_val + 1;
674 template<unsigned BlockSize, class Docid>
675 const unsigned char *decode_pfor_vb_interleaved(const unsigned char *in, Docid *out)
677 if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
678 return decode_pfor_vb_interleaved_128_32(in, out);
680 return decode_pfor_vb_interleaved_generic(in, out);
684 __attribute__((target("default")))
685 const unsigned char *
686 decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
688 return decode_pfor_vb_interleaved_generic<128>(in, out);
691 // Specialized version for SSE2.
692 __attribute__((target("sse2")))
693 const unsigned char *
694 decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
696 constexpr unsigned BlockSize = 128;
697 using Docid = uint32_t;
699 const unsigned bit_width = *in++ & 0x3f;
700 unsigned num_exceptions = *in++;
702 // Decode the base values.
703 in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/false, /*DeltaDecode=*/false>(in, bit_width, out);
705 // Decode exceptions.
706 Docid exceptions[BlockSize];
709 for (unsigned i = 0; i < num_exceptions; ++i) {
710 exceptions[i] = read_le<Docid>(in);
714 for (unsigned i = 0; i < num_exceptions; ++i) {
715 in = read_vb(in, &exceptions[i]);
720 for (unsigned i = 0; i < num_exceptions; ++i) {
721 unsigned idx = *in++;
722 out[idx] |= exceptions[i] << bit_width;
725 delta_decode_sse2<BlockSize>(out);
737 template<unsigned BlockSize, class Docid>
738 const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, bool interleaved, Docid *out)
743 in = read_baseval(in, out++);
745 for (unsigned i = 1; i < num; i += BlockSize, out += BlockSize) {
746 const unsigned num_this_block = std::min<unsigned>(num - i, BlockSize);
747 switch (in[0] >> 6) {
749 if (interleaved && num_this_block == BlockSize) {
750 dprintf("%d+%d: blocktype=%d (for, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
751 in = decode_for_interleaved<BlockSize>(in, out);
753 dprintf("%d+%d: blocktype=%d (for), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
754 in = decode_for(in, num_this_block, out);
757 case BlockType::PFOR_VB:
758 if (interleaved && num_this_block == BlockSize) {
759 dprintf("%d+%d: blocktype=%d (pfor + vb, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
760 in = decode_pfor_vb_interleaved<BlockSize>(in, out);
762 dprintf("%d+%d: blocktype=%d (pfor + vb), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
763 in = decode_pfor_vb<BlockSize>(in, num_this_block, out);
766 case BlockType::PFOR_BITMAP:
767 if (interleaved && num_this_block == BlockSize) {
768 dprintf("%d+%d: blocktype=%d (pfor + bitmap, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
769 in = decode_pfor_bitmap_interleaved<BlockSize>(in, out);
771 dprintf("%d+%d: blocktype=%d (pfor + bitmap), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
772 in = decode_pfor_bitmap(in, num_this_block, out);
775 case BlockType::CONSTANT:
776 dprintf("%d+%d: blocktype=%d (constant), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
777 in = decode_constant(in, num_this_block, out);
785 #endif // !defined(_TURBOPFOR_H)