From: Steinar H. Gunderson Date: Tue, 6 Oct 2020 19:30:08 +0000 (+0200) Subject: Unroll and specialize decode_bitmap_sse2(). X-Git-Tag: 1.0.0~45 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=58320e42bd3152aee9ebbc4743229d1eb390ec87;p=plocate Unroll and specialize decode_bitmap_sse2(). By asking GCC to unroll the loop, and specializing on the bit width via a template parameter, we can get rid of a lot of the control overhead. This takes us up from 60% to 80% of reference performance, still without requiring anything more than SSE2. --- diff --git a/turbopfor.h b/turbopfor.h index 3b57d5a..cd18950 100644 --- a/turbopfor.h +++ b/turbopfor.h @@ -3,7 +3,7 @@ // A reimplementation of parts of the TurboPFor codecs, using the same // storage format. These are not as fast as the reference implementation -// (about 60% of the performance, averaged over a real plocate corpus), +// (about 80% of the performance, averaged over a real plocate corpus), // and do not support the same breadth of codecs (in particular, only // delta-plus-1 is implemented, and only 32-bit docids are tested), // but aim to be more portable and (ideally) easier-to-understand. 
@@ -276,13 +276,14 @@ inline void delta_decode_sse2(uint32_t *out) } } -template +template __attribute__((target("sse2"))) -const unsigned char *decode_bitmap_sse2(const unsigned char *in, unsigned bit_width, uint32_t *out) +const unsigned char *decode_bitmap_sse2_unrolled(const unsigned char *in, uint32_t *out) { __m128i *outvec = reinterpret_cast<__m128i *>(out); DeltaDecoderSSE2 delta(out[-1]); InterleavedBitReaderSSE2 bs(in, bit_width); + #pragma GCC unroll 32 for (unsigned i = 0; i < BlockSize / 4; ++i) { __m128i val = bs.read(); if constexpr (OrWithExisting) { @@ -296,6 +297,48 @@ const unsigned char *decode_bitmap_sse2(const unsigned char *in, unsigned bit_wi in += bytes_for_packed_bits(BlockSize, bit_width); return in; } + +template +__attribute__((target("sse2"))) +const unsigned char *decode_bitmap_sse2(const unsigned char *in, unsigned bit_width, uint32_t *out) +{ + switch (bit_width) { + case 0: return decode_bitmap_sse2_unrolled(in, out); + case 1: return decode_bitmap_sse2_unrolled(in, out); + case 2: return decode_bitmap_sse2_unrolled(in, out); + case 3: return decode_bitmap_sse2_unrolled(in, out); + case 4: return decode_bitmap_sse2_unrolled(in, out); + case 5: return decode_bitmap_sse2_unrolled(in, out); + case 6: return decode_bitmap_sse2_unrolled(in, out); + case 7: return decode_bitmap_sse2_unrolled(in, out); + case 8: return decode_bitmap_sse2_unrolled(in, out); + case 9: return decode_bitmap_sse2_unrolled(in, out); + case 10: return decode_bitmap_sse2_unrolled(in, out); + case 11: return decode_bitmap_sse2_unrolled(in, out); + case 12: return decode_bitmap_sse2_unrolled(in, out); + case 13: return decode_bitmap_sse2_unrolled(in, out); + case 14: return decode_bitmap_sse2_unrolled(in, out); + case 15: return decode_bitmap_sse2_unrolled(in, out); + case 16: return decode_bitmap_sse2_unrolled(in, out); + case 17: return decode_bitmap_sse2_unrolled(in, out); + case 18: return decode_bitmap_sse2_unrolled(in, out); + case 19: return 
decode_bitmap_sse2_unrolled(in, out); + case 20: return decode_bitmap_sse2_unrolled(in, out); + case 21: return decode_bitmap_sse2_unrolled(in, out); + case 22: return decode_bitmap_sse2_unrolled(in, out); + case 23: return decode_bitmap_sse2_unrolled(in, out); + case 24: return decode_bitmap_sse2_unrolled(in, out); + case 25: return decode_bitmap_sse2_unrolled(in, out); + case 26: return decode_bitmap_sse2_unrolled(in, out); + case 27: return decode_bitmap_sse2_unrolled(in, out); + case 28: return decode_bitmap_sse2_unrolled(in, out); + case 29: return decode_bitmap_sse2_unrolled(in, out); + case 30: return decode_bitmap_sse2_unrolled(in, out); + case 31: return decode_bitmap_sse2_unrolled(in, out); + case 32: return decode_bitmap_sse2_unrolled(in, out); + } + assert(false); +} #endif // Like decode_for(), but the values are organized in four independent streams,