]> git.sesse.net Git - plocate/blob - turbopfor.h
Fuse delta decoding into the decoding loops where appropriate.
[plocate] / turbopfor.h
1 #ifndef _TURBOPFOR_H
2 #define _TURBOPFOR_H 1
3
4 // A reimplementation of parts of the TurboPFor codecs, using the same
5 // storage format. These are not as fast as the reference implementation
6 // (about 60% of the performance, averaged over a real plocate corpus),
7 // and do not support the same breadth of codecs (in particular, only
8 // delta-plus-1 is implemented, and only 32-bit docids are tested),
9 // but aim to be more portable and (ideally) easier-to-understand.
10 // In particular, they will compile on x86 without SSE4.1 or AVX support.
11 //
12 // The main reference is https://michael.stapelberg.ch/posts/2019-02-05-turbopfor-analysis/,
13 // although some implementation details have been worked out by studying the
14 // TurboPFor code.
15
16 #include <assert.h>
17 #include <endian.h>
18 #include <stdint.h>
19 #include <string.h>
20 #include <limits.h>
21
22 #include <algorithm>
23
24 #if defined(__i386__) || defined(__x86_64__)
25 #define COULD_HAVE_SSE2
26 #include <immintrin.h>
27 #endif
28
29 // Forward declarations to declare to the template code below that they exist.
30 // (These must seemingly be non-templates for function multiversioning to work.)
31 __attribute__((target("default")))
32 const unsigned char *decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out);
33 __attribute__((target("default")))
34 const unsigned char *decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out);
35 __attribute__((target("default")))
36 const unsigned char *decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
37
38 #ifdef COULD_HAVE_SSE2
39 __attribute__((target("sse2")))
40 const unsigned char *decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out);
41 __attribute__((target("sse2")))
42 const unsigned char *decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out);
43 __attribute__((target("sse2")))
44 const unsigned char *decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out);
45 #endif
46
47 template<class Docid>
48 Docid read_le(const void *in)
49 {
50         Docid val;
51         memcpy(&val, in, sizeof(val));
52         if constexpr (sizeof(Docid) == 8) {
53                 return le64toh(val);
54         } else if constexpr (sizeof(Docid) == 4) {
55                 return le32toh(val);
56         } else if constexpr (sizeof(Docid) == 2) {
57                 return le16toh(val);
58         } else if constexpr (sizeof(Docid) == 1) {
59                 return val;
60         } else {
61                 assert(false);
62         }
63 }
64
65 // Reads a single value with an encoding that looks a bit like PrefixVarint.
66 // It's unclear why this doesn't use the varbyte encoding.
67 template<class Docid>
68 const unsigned char *read_baseval(const unsigned char *in, Docid *out)
69 {
70         //fprintf(stderr, "baseval: 0x%02x 0x%02x 0x%02x 0x%02x\n", in[0], in[1], in[2], in[3]);
71         if (*in < 128) {
72                 *out = *in;
73                 return in + 1;
74         } else if (*in < 192) {
75                 *out = ((uint32_t(in[0]) << 8) | uint32_t(in[1])) & 0x3fff;
76                 return in + 2;
77         } else if (*in < 224) {
78                 *out = ((uint32_t(in[0]) << 16) |
79                         (uint32_t(in[2]) << 8) |
80                         (uint32_t(in[1]))) & 0x1fffff;
81                 return in + 3;
82         } else {
83                 assert(false);  // Not implemented.
84         }
85 }
86
87 template<class Docid>
88 const unsigned char *read_vb(const unsigned char *in, Docid *out)
89 {
90         if (*in <= 176) {
91                 *out = *in;
92                 return in + 1;
93         } else if (*in <= 240) {
94                 *out = ((uint32_t(in[0] - 177) << 8) | uint32_t(in[1])) + 177;
95                 return in + 2;
96         } else if (*in <= 248) {
97                 *out = ((uint32_t(in[0] - 241) << 16) | read_le<uint16_t>(in + 1)) + 16561;
98                 return in + 3;
99         } else if (*in == 249) {
100                 *out = (uint32_t(in[1])) |
101                         (uint32_t(in[2]) << 8) |
102                         (uint32_t(in[3]) << 16);
103                 return in + 4;
104         } else if (*in == 250) {
105                 *out = read_le<uint32_t>(in + 1);
106                 return in + 5;
107         } else {
108                 assert(false);
109         }
110 }
111
112 struct BitReader {
113 public:
114         BitReader(const unsigned char *in, unsigned bits)
115                 : in(in), bits(bits), mask((1U << bits) - 1) {}
116         uint32_t read()
117         {
118                 uint32_t val = (read_le<uint32_t>(in) >> bits_used) & mask;
119
120                 bits_used += bits;
121                 in += bits_used / 8;
122                 bits_used %= 8;
123
124                 return val;
125         }
126
127 private:
128         const unsigned char *in;
129         const unsigned bits;
130         const unsigned mask;
131         unsigned bits_used = 0;
132 };
133
134 template<unsigned NumStreams>
135 struct InterleavedBitReader {
136 public:
137         InterleavedBitReader(const unsigned char *in, unsigned bits)
138                 : in(in), bits(bits), mask((1U << bits) - 1) {}
139         uint32_t read()
140         {
141                 uint32_t val;
142                 if (bits_used + bits > 32) {
143                         val = (read_le<uint32_t>(in) >> bits_used) | (read_le<uint32_t>(in + Stride) << (32 - bits_used));
144                 } else {
145                         val = (read_le<uint32_t>(in) >> bits_used);
146                 }
147
148                 bits_used += bits;
149                 in += Stride * (bits_used / 32);
150                 bits_used %= 32;
151
152                 return val & mask;
153         }
154
155 private:
156         static constexpr unsigned Stride = NumStreams * sizeof(uint32_t);
157         const unsigned char *in;
158         const unsigned bits;
159         const unsigned mask;
160         unsigned bits_used = 0;
161 };
162
163 // Does not properly account for overflow.
164 inline unsigned div_round_up(unsigned val, unsigned div)
165 {
166         return (val + div - 1) / div;
167 }
168
169 inline unsigned bytes_for_packed_bits(unsigned num, unsigned bit_width)
170 {
171         return div_round_up(num * bit_width, CHAR_BIT);
172 }
173
174 // Constant block. Layout:
175 //
176 //  - Bit width (6 bits) | type << 6
177 //  - Base values (<bits> bits, rounded up to nearest byte)
178 template<class Docid>
179 const unsigned char *decode_constant(const unsigned char *in, unsigned num, Docid *out)
180 {
181         const unsigned bit_width = *in++ & 0x3f;
182         Docid val = read_le<Docid>(in);
183         if (bit_width < sizeof(Docid) * 8) {
184                 val &= ((1U << bit_width) - 1);
185         }
186
187         Docid prev_val = out[-1];
188         for (unsigned i = 0; i < num; ++i) {
189                 out[i] = prev_val = val + prev_val + 1;
190         }
191         return in + div_round_up(bit_width, 8);
192 }
193
194 // FOR block (ie., PFor without exceptions). Layout:
195 //
196 //  - Bit width (6 bits) | type << 6
197 //  - Base values (<num> values of <bits> bits, rounded up to a multiple of 32 values)
198 template<class Docid>
199 const unsigned char *decode_for(const unsigned char *in, unsigned num, Docid *out)
200 {
201         const unsigned bit_width = *in++ & 0x3f;
202
203         Docid prev_val = out[-1];
204         BitReader bs(in, bit_width);
205         for (unsigned i = 0; i < num; ++i) {
206                 prev_val = out[i] = bs.read() + prev_val + 1;
207         }
208         return in + bytes_for_packed_bits(num, bit_width);
209 }
210
211 #ifdef COULD_HAVE_SSE2
212 class DeltaDecoderSSE2 {
213 public:
214         DeltaDecoderSSE2(uint32_t prev_val) : prev_val(_mm_set1_epi32(prev_val)) {}
215         __m128i decode(__m128i val) {
216                 val = _mm_add_epi32(val, _mm_slli_si128(val, 4));
217                 val = _mm_add_epi32(val, _mm_slli_si128(val, 8));
218                 val = _mm_add_epi32(val, _mm_add_epi32(prev_val, delta));
219                 prev_val = _mm_shuffle_epi32(val, _MM_SHUFFLE(3, 3, 3, 3));
220                 return val;
221         }
222
223 private:
224         // Use 4/3/2/1 as delta instead of fixed 1, so that we can do the prev_val + delta
225         // in parallel with something else.
226         const __m128i delta = _mm_set_epi32(4, 3, 2, 1);
227
228         __m128i prev_val;
229 };
230
231 template<unsigned BlockSize>
232 __attribute__((target("sse2")))
233 inline void delta_decode_sse2(uint32_t *out)
234 {
235         DeltaDecoderSSE2 delta(out[-1]);
236         __m128i *outvec = reinterpret_cast<__m128i *>(out);
237         for (unsigned i = 0; i < BlockSize / 4; ++i) {
238                 __m128i val = _mm_loadu_si128(outvec + i);
239                 _mm_storeu_si128(outvec + i, delta.decode(val));
240         }
241 }
242
243 template<unsigned BlockSize, bool OrWithExisting, bool DeltaDecode>
244 __attribute__((target("sse2")))
245 const unsigned char *decode_bitmap_sse2(const unsigned char *in, unsigned bit_width, uint32_t *out)
246 {
247         const __m128i *invec = reinterpret_cast<const __m128i *>(in);
248         __m128i *outvec = reinterpret_cast<__m128i *>(out);
249         const __m128i mask = _mm_set1_epi32((1U << bit_width) - 1);
250         unsigned bits_used = 0;
251         DeltaDecoderSSE2 delta(out[-1]);
252         for (unsigned i = 0; i < BlockSize / 4; ++i) {
253                 __m128i val = _mm_srli_epi32(_mm_loadu_si128(invec), bits_used);
254                 if (bits_used + bit_width > 32) {
255                         __m128i val_upper = _mm_slli_epi32(_mm_loadu_si128(invec + 1), 32 - bits_used);
256                         val = _mm_or_si128(val, val_upper);
257                 }
258                 val = _mm_and_si128(val, mask);
259                 if constexpr (OrWithExisting) {
260                         val = _mm_or_si128(val, _mm_loadu_si128(outvec + i));
261                 }
262                 if constexpr (DeltaDecode) {
263                         val = delta.decode(val);
264                 }
265                 _mm_storeu_si128(outvec + i, val);
266
267                 bits_used += bit_width;
268                 invec += bits_used / 32;
269                 bits_used %= 32;
270         }
271         in += bytes_for_packed_bits(BlockSize, bit_width);
272         return in;
273 }
274 #endif
275
276 // Like decode_for(), but the values are organized in four independent streams,
277 // for SIMD (presumably SSE2). Supports a whole block only.
278 template<unsigned BlockSize, class Docid>
279 const unsigned char *decode_for_interleaved_generic(const unsigned char *in, Docid *out)
280 {
281         const unsigned bit_width = *in++ & 0x3f;
282
283         InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
284         InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
285         InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
286         InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
287         for (unsigned i = 0; i < BlockSize / 4; ++i) {
288                 out[i * 4 + 0] = bs0.read();
289                 out[i * 4 + 1] = bs1.read();
290                 out[i * 4 + 2] = bs2.read();
291                 out[i * 4 + 3] = bs3.read();
292         }
293         Docid prev_val = out[-1];
294         for (unsigned i = 0; i < BlockSize; ++i) {
295                 out[i] = prev_val = out[i] + prev_val + 1;
296         }
297         return in + bytes_for_packed_bits(BlockSize, bit_width);
298 }
299
300 template<unsigned BlockSize, class Docid>
301 const unsigned char *decode_for_interleaved(const unsigned char *in, Docid *out)
302 {
303         if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
304                 return decode_for_interleaved_128_32(in, out);
305         } else {
306                 return decode_for_interleaved_generic(in, out);
307         }
308 }
309
310 __attribute__((target("default")))
311 const unsigned char *decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out)
312 {
313         return decode_for_interleaved_generic<128>(in, out);
314 }
315
316 #ifdef COULD_HAVE_SSE2
317 // Specialized version for SSE2.
318 __attribute__((target("sse2")))
319 const unsigned char *decode_for_interleaved_128_32(const unsigned char *in, uint32_t *out)
320 {
321         constexpr unsigned BlockSize = 128;
322
323         const unsigned bit_width = *in++ & 0x3f;
324
325         in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/false, /*DeltaDecode=*/true>(in, bit_width, out);
326
327         return in;
328 }
329 #endif
330
331 template<class Docid>
332 const unsigned char *decode_pfor_bitmap_exceptions(const unsigned char *in, unsigned num, unsigned bit_width, Docid *out)
333 {
334         const unsigned exception_bit_width = *in++;
335         const uint64_t *exception_bitmap_ptr = reinterpret_cast<const uint64_t *>(in);
336         in += div_round_up(num, 8);
337
338         int num_exceptions = 0;
339
340         BitReader bs(in, exception_bit_width);
341         for (unsigned i = 0; i < num; i += 64, ++exception_bitmap_ptr) {
342                 uint64_t exceptions = read_le<uint64_t>(exception_bitmap_ptr);
343                 if (num - i < 64) {
344                         // We've read some bytes past the end, so clear out the junk bits.
345                         exceptions &= (1ULL << (num - i)) - 1;
346                 }
347                 for (; exceptions != 0; exceptions &= exceptions - 1, ++num_exceptions) {
348                         unsigned idx = (ffsll(exceptions) - 1) + i;
349                         out[idx] = bs.read() << bit_width;
350                 }
351         }
352         in += bytes_for_packed_bits(num_exceptions, exception_bit_width);
353         return in;
354 }
355
356 // PFor block with bitmap exceptions. Layout:
357 //
358 //  - Bit width (6 bits) | type << 6
359 //  - Exception bit width (8 bits)
360 //  - Bitmap of which values have exceptions (<num> bits, rounded up to a byte)
361 //  - Exceptions (<num_exc> values of <bits_exc> bits, rounded up to a byte)
362 //  - Base values (<num> values of <bits> bits, rounded up to a byte)
363 template<class Docid>
364 const unsigned char *decode_pfor_bitmap(const unsigned char *in, unsigned num, Docid *out)
365 {
366         memset(out, 0, num * sizeof(Docid));
367
368         const unsigned bit_width = *in++ & 0x3f;
369
370         in = decode_pfor_bitmap_exceptions(in, num, bit_width, out);
371
372         // Decode the base values, and delta-decode.
373         Docid prev_val = out[-1];
374         BitReader bs(in, bit_width);
375         for (unsigned i = 0; i < num; ++i) {
376                 out[i] = prev_val = (out[i] | bs.read()) + prev_val + 1;
377         }
378         return in + bytes_for_packed_bits(num, bit_width);
379 }
380
381 // Like decode_pfor_bitmap(), but the base values are organized in four
382 // independent streams, for SIMD (presumably SSE2). Supports a whole block only.
383 template<unsigned BlockSize, class Docid>
384 const unsigned char *decode_pfor_bitmap_interleaved_generic(const unsigned char *in, Docid *out)
385 {
386         memset(out, 0, BlockSize * sizeof(Docid));
387
388         const unsigned bit_width = *in++ & 0x3f;
389
390         in = decode_pfor_bitmap_exceptions(in, BlockSize, bit_width, out);
391
392         // Decode the base values.
393         InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
394         InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
395         InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
396         InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
397         for (unsigned i = 0; i < BlockSize / 4; ++i) {
398                 out[i * 4 + 0] |= bs0.read();
399                 out[i * 4 + 1] |= bs1.read();
400                 out[i * 4 + 2] |= bs2.read();
401                 out[i * 4 + 3] |= bs3.read();
402         }
403
404         // Delta-decode.
405         Docid prev_val = out[-1];
406         for (unsigned i = 0; i < BlockSize; ++i) {
407                 out[i] = prev_val = out[i] + prev_val + 1;
408         }
409         return in + bytes_for_packed_bits(BlockSize, bit_width);
410 }
411
412 template<unsigned BlockSize, class Docid>
413 const unsigned char *decode_pfor_bitmap_interleaved(const unsigned char *in, Docid *out)
414 {
415         if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
416                 return decode_pfor_bitmap_interleaved_128_32(in, out);
417         } else {
418                 return decode_pfor_bitmap_interleaved_generic(in, out);
419         }
420 }
421
422 __attribute__((target("default")))
423 const unsigned char *decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out)
424 {
425         return decode_pfor_bitmap_interleaved_generic<128>(in, out);
426 }
427
428 #ifdef COULD_HAVE_SSE2
429 // Specialized version for SSE2.
430 __attribute__((target("sse2")))
431 const unsigned char *decode_pfor_bitmap_interleaved_128_32(const unsigned char *in, uint32_t *out)
432 {
433         constexpr unsigned BlockSize = 128;
434         using Docid = uint32_t;
435
436         memset(out, 0, BlockSize * sizeof(Docid));
437
438         const unsigned bit_width = *in++ & 0x3f;
439
440         in = decode_pfor_bitmap_exceptions(in, BlockSize, bit_width, out);
441         in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/true, /*DeltaDecode=*/true>(in, bit_width, out);
442
443         return in;
444 }
445 #endif
446
447 // PFor block with variable-byte exceptions. Layout:
448 //
449 //  - Bit width (6 bits) | type << 6
450 //  - Number of exceptions (8 bits)
451 //  - Base values (<num> values of <bits> bits, rounded up to a byte)
452 //  - Exceptions:
453 //    - If first byte is 255, <num_exc> 32-bit values (does not include the 255 byte)
454 //    - Else, <num_exc> varbyte-encoded values (includes the non-255 byte)
455 //  - Indexes of exceptions (<num_exc> bytes).
456 template<unsigned BlockSize, class Docid>
457 const unsigned char *decode_pfor_vb(const unsigned char *in, unsigned num, Docid *out)
458 {
459         //fprintf(stderr, "in=%p out=%p num=%u\n", in, out, num);
460
461         const unsigned bit_width = *in++ & 0x3f;
462         unsigned num_exceptions = *in++;
463
464         // Decode the base values.
465         BitReader bs(in, bit_width);
466         for (unsigned i = 0; i < num; ++i) {
467                 out[i] = bs.read();
468         }
469         in += bytes_for_packed_bits(num, bit_width);
470
471         // Decode exceptions.
472         Docid exceptions[BlockSize];
473         if (*in == 255) {
474                 ++in;
475                 for (unsigned i = 0; i < num_exceptions; ++i) {
476                         exceptions[i] = read_le<Docid>(in);
477                         in += sizeof(Docid);
478                 }
479         } else {
480                 for (unsigned i = 0; i < num_exceptions; ++i) {
481                         in = read_vb(in, &exceptions[i]);
482                 }
483         }
484         // Apply exceptions.
485         for (unsigned i = 0; i < num_exceptions; ++i) {
486                 unsigned idx = *in++;
487                 out[idx] |= exceptions[i] << bit_width;
488         }
489
490         // Delta-decode.
491         Docid prev_val = out[-1];
492         for (unsigned i = 0; i < num; ++i) {
493                 out[i] = prev_val = out[i] + prev_val + 1;
494         }
495
496         return in;
497 }
498
499 // Like decode_pfor_vb(), but the base values are organized in four
500 // independent streams, for SIMD (presumably SSE2). Supports a whole block only.
501 template<unsigned BlockSize, class Docid>
502 const unsigned char *decode_pfor_vb_interleaved_generic(const unsigned char *in, Docid *out)
503 {
504         const unsigned bit_width = *in++ & 0x3f;
505         unsigned num_exceptions = *in++;
506
507         // Decode the base values.
508         InterleavedBitReader<4> bs0(in + 0 * sizeof(uint32_t), bit_width);
509         InterleavedBitReader<4> bs1(in + 1 * sizeof(uint32_t), bit_width);
510         InterleavedBitReader<4> bs2(in + 2 * sizeof(uint32_t), bit_width);
511         InterleavedBitReader<4> bs3(in + 3 * sizeof(uint32_t), bit_width);
512         for (unsigned i = 0; i < BlockSize / 4; ++i) {
513                 out[i * 4 + 0] = bs0.read();
514                 out[i * 4 + 1] = bs1.read();
515                 out[i * 4 + 2] = bs2.read();
516                 out[i * 4 + 3] = bs3.read();
517         }
518         in += bytes_for_packed_bits(BlockSize, bit_width);
519
520         // Decode exceptions.
521         Docid exceptions[BlockSize];
522         if (*in == 255) {
523                 ++in;
524                 for (unsigned i = 0; i < num_exceptions; ++i) {
525                         exceptions[i] = read_le<Docid>(in);
526                         in += sizeof(Docid);
527                 }
528         } else {
529                 for (unsigned i = 0; i < num_exceptions; ++i) {
530                         in = read_vb(in, &exceptions[i]);
531                 }
532         }
533
534         // Apply exceptions.
535         for (unsigned i = 0; i < num_exceptions; ++i) {
536                 unsigned idx = *in++;
537                 out[idx] |= exceptions[i] << bit_width;
538         }
539
540         // Delta-decode.
541         Docid prev_val = out[-1];
542         for (unsigned i = 0; i < BlockSize; ++i) {
543                 out[i] = prev_val = out[i] + prev_val + 1;
544         }
545
546         return in;
547 }
548
549 template<unsigned BlockSize, class Docid>
550 const unsigned char *decode_pfor_vb_interleaved(const unsigned char *in, Docid *out)
551 {
552         if constexpr (BlockSize == 128 && sizeof(Docid) == sizeof(uint32_t)) {
553                 return decode_pfor_vb_interleaved_128_32(in, out);
554         } else {
555                 return decode_pfor_vb_interleaved_generic(in, out);
556         }
557 }
558
559 __attribute__((target("default")))
560 const unsigned char *decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
561 {
562         return decode_pfor_vb_interleaved_generic<128>(in, out);
563 }
564
565 // Specialized version for SSE2.
566 __attribute__((target("sse2")))
567 const unsigned char *decode_pfor_vb_interleaved_128_32(const unsigned char *in, uint32_t *out)
568 {
569         constexpr unsigned BlockSize = 128;
570         using Docid = uint32_t;
571
572         const unsigned bit_width = *in++ & 0x3f;
573         unsigned num_exceptions = *in++;
574
575         // Decode the base values.
576         in = decode_bitmap_sse2<BlockSize, /*OrWithExisting=*/false, /*DeltaDecode=*/false>(in, bit_width, out);
577
578         // Decode exceptions.
579         Docid exceptions[BlockSize];
580         if (*in == 255) {
581                 ++in;
582                 for (unsigned i = 0; i < num_exceptions; ++i) {
583                         exceptions[i] = read_le<Docid>(in);
584                         in += sizeof(Docid);
585                 }
586         } else {
587                 for (unsigned i = 0; i < num_exceptions; ++i) {
588                         in = read_vb(in, &exceptions[i]);
589                 }
590         }
591
592         // Apply exceptions.
593         for (unsigned i = 0; i < num_exceptions; ++i) {
594                 unsigned idx = *in++;
595                 out[idx] |= exceptions[i] << bit_width;
596         }
597
598         delta_decode_sse2<BlockSize>(out);
599
600         return in;
601 }
602
603 enum BlockType {
604         FOR = 0,
605         PFOR_VB = 1,
606         PFOR_BITMAP = 2,
607         CONSTANT = 3
608 };
609
610 template<unsigned BlockSize, class Docid>
611 const unsigned char *decode_pfor_delta1(const unsigned char *in, unsigned num, bool interleaved, Docid *out)
612 {
613         if (num == 0) {
614                 return in;
615         }
616         in = read_baseval(in, out++);
617
618         for (unsigned i = 1; i < num; i += BlockSize, out += BlockSize) {
619                 const unsigned num_this_block = std::min<unsigned>(num - i, BlockSize);
620                 switch (in[0] >> 6) {
621                 case BlockType::FOR:
622                         if (interleaved && num_this_block == BlockSize) {
623                                 dprintf("%d+%d: blocktype=%d (for, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
624                                 in = decode_for_interleaved<BlockSize>(in, out);
625                         } else {
626                                 dprintf("%d+%d: blocktype=%d (for), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
627                                 in = decode_for(in, num_this_block, out);
628                         }
629                         break;
630                 case BlockType::PFOR_VB:
631                         if (interleaved && num_this_block == BlockSize) {
632                                 dprintf("%d+%d: blocktype=%d (pfor + vb, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
633                                 in = decode_pfor_vb_interleaved<BlockSize>(in, out);
634                         } else {
635                                 dprintf("%d+%d: blocktype=%d (pfor + vb), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
636                                 in = decode_pfor_vb<BlockSize>(in, num_this_block, out);
637                         }
638                         break;
639                 case BlockType::PFOR_BITMAP:
640                         if (interleaved && num_this_block == BlockSize) {
641                                 dprintf("%d+%d: blocktype=%d (pfor + bitmap, interleaved), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
642                                 in = decode_pfor_bitmap_interleaved<BlockSize>(in, out);
643                         } else {
644                                 dprintf("%d+%d: blocktype=%d (pfor + bitmap), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
645                                 in = decode_pfor_bitmap(in, num_this_block, out);
646                         }
647                         break;
648                 case BlockType::CONSTANT:
649                         dprintf("%d+%d: blocktype=%d (constant), bitwidth=%d\n", i, num_this_block, in[0] >> 6, in[0] & 0x3f);
650                         in = decode_constant(in, num_this_block, out);
651                         break;
652                 }
653         }
654
655         return in;
656 }
657
658 #endif  // !defined(_TURBOPFOR_H)