git.sesse.net Git - narabu/blob - ryg_rans/rans_word_sse41.h

   1 // Word-aligned SSE 4.1 rANS encoder/decoder - public domain - Fabian 'ryg' Giesen
   2 //
   3 // This implementation has a regular rANS encoder and a 4-way interleaved SIMD
   4 // decoder. Like rans_byte.h, it's intended to illustrate the idea, not to
   5 // be used as a drop-in arithmetic coder.
   6
   7 #ifndef RANS_WORD_SSE41_HEADER
   8 #define RANS_WORD_SSE41_HEADER
   9
  10 #include <stdint.h>
  11 #include <smmintrin.h>
  12
  13 #include "platform.h"
  14
  15 // READ ME FIRST:
  16 //
  17 // The intention in this version is to demonstrate a design where the decoder
  18 // is made as fast as possible, even when it makes the encoder slightly slower
  19 // or hurts compression a bit. (The code in rans_byte.h, with the 31-bit
  20 // arithmetic to allow for faster division by constants, is a more "balanced"
  21 // approach).
  22 //
  23 // This version is intended to be used with relatively low-resolution
  24 // probability distributions (scale_bits=12 or less). In these regions, the
  25 // "fully unrolled" table-based approach shown here (suggested by "enotuss"
  26 // on my blog) is optimal; for larger scale_bits, other approaches are more
  27 // favorable. It also only assumes an 8-bit symbol alphabet for simplicity.
  28 //
  29 // Unlike rans_byte.h, this file needs to be compiled as C++.
  30
  31 // --------------------------------------------------------------------------
  32
// This coder uses L=1<<16 and B=1<<16 (16-bit word based renormalization).
// Since we still continue to use 32-bit words, this means we require
// scale_bits <= 16; on the plus side, renormalization never needs to
// iterate.
#define RANS_WORD_L (1u << 16)

// Probability resolution. Decoder states are split into a slot index (the
// low RANS_WORD_SCALE_BITS bits) and a quotient (the remaining high bits);
// RANS_WORD_M is the resulting number of slots.
#define RANS_WORD_SCALE_BITS 12
#define RANS_WORD_M (1u << RANS_WORD_SCALE_BITS)

// Presumably the size of the 8-bit symbol alphabet; it is not referenced
// anywhere else in this header.
#define RANS_WORD_NSYMS 256

// Scalar encoder and decoder states are plain 32-bit words.
typedef uint32_t RansWordEnc;
typedef uint32_t RansWordDec;
  46
// State of the 4-way SIMD decoder: four 32-bit rANS states packed into one
// SSE register. `lane` overlays the same 16 bytes for per-state scalar access.
typedef union {
    __m128i simd;
    uint32_t lane[4];
} RansSimdDec;
  51
// Per-slot decode data. `freq` and `bias` share storage with `u32` so that
// the SIMD decoder can fetch both with one 32-bit load; it then takes freq
// from the low 16 bits and bias from the high 16 bits, which corresponds to
// this field order on little-endian targets.
union RansWordSlot {
    uint32_t u32;
    struct {
        uint16_t freq;
        uint16_t bias;
    };
};
  59
// Decode tables, indexed by slot (the low RANS_WORD_SCALE_BITS bits of a
// decoder state).
struct RansWordTables {
    RansWordSlot slots[RANS_WORD_M];   // frequency and bias for each slot
    uint8_t slot2sym[RANS_WORD_M];     // symbol that owns each slot
};
  64
  65 // Initialize slots for a symbol in the table
  66 static inline void RansWordTablesInitSymbol(RansWordTables* tab, uint8_t sym, uint32_t start, uint32_t freq)
  67 {
  68     for (uint32_t i=0; i < freq; i++) {
  69         uint32_t slot = start + i;
  70         tab->slot2sym[slot] = sym;
  71         tab->slots[slot].freq = (uint16_t)freq;
  72         tab->slots[slot].bias = (uint16_t)i;
  73     }
  74 }
  75
  76 // Initialize a rANS encoder
  77 static inline RansWordEnc RansWordEncInit()
  78 {
  79     return RANS_WORD_L;
  80 }
  81
  82 // Encodes a single symbol with range "start" and frequency "freq".
  83 static inline void RansWordEncPut(RansWordEnc* r, uint16_t** pptr, uint32_t start, uint32_t freq)
  84 {
  85     // renormalize
  86     uint32_t x = *r;
  87     if (x >= ((RANS_WORD_L >> RANS_WORD_SCALE_BITS) << 16) * freq) {
  88         *pptr -= 1;
  89         **pptr = (uint16_t) (x & 0xffff);
  90         x >>= 16;
  91     }
  92
  93     // x = C(s,x)
  94     *r = ((x / freq) << RANS_WORD_SCALE_BITS) + (x % freq) + start;
  95 }
  96
  97 // Flushes the rANS encoder
  98 static inline void RansWordEncFlush(RansWordEnc* r, uint16_t** pptr)
  99 {
 100     uint32_t x = *r;
 101     uint16_t* ptr = *pptr;
 102
 103     ptr -= 2;
 104     ptr[0] = (uint16_t) (x >> 0);
 105     ptr[1] = (uint16_t) (x >> 16);
 106
 107     *pptr = ptr;
 108 }
 109
 110 // Initializes a rANS decoder.
 111 static inline void RansWordDecInit(RansWordDec* r, uint16_t** pptr)
 112 {
 113     uint32_t x;
 114     uint16_t* ptr = *pptr;
 115
 116     x  = ptr[0] << 0;
 117     x |= ptr[1] << 16;
 118     ptr += 2;
 119
 120     *pptr = ptr;
 121     *r = x;
 122 }
 123
 124 // Decodes a symbol using the given tables.
 125 static inline uint8_t RansWordDecSym(RansWordDec* r, RansWordTables const* tab)
 126 {
 127     uint32_t x = *r;
 128     uint32_t slot = x & (RANS_WORD_M - 1);
 129
 130     // s, x = D(x)
 131     *r = tab->slots[slot].freq * (x >> RANS_WORD_SCALE_BITS) + tab->slots[slot].bias;
 132     return tab->slot2sym[slot];
 133 }
 134
 135 // Renormalize after decoding a symbol.
 136 static inline void RansWordDecRenorm(RansWordDec* r, uint16_t** pptr)
 137 {
 138     uint32_t x = *r;
 139     if (x < RANS_WORD_L) {
 140         *r = (x << 16) | **pptr;
 141         *pptr += 1;
 142     }
 143 }
 144
 145 // Initializes a SIMD rANS decoder.
 146 static inline void RansSimdDecInit(RansSimdDec* r, uint16_t** pptr)
 147 {
 148     r->simd = _mm_loadu_si128((const __m128i*)*pptr);
 149     *pptr += 2*4;
 150 }
 151
 152 // Decodes a four symbols in parallel using the given tables.
 153 static inline uint32_t RansSimdDecSym(RansSimdDec* r, RansWordTables const* tab)
 154 {
 155     __m128i freq_bias_lo, freq_bias_hi, freq_bias;
 156     __m128i freq, bias;
 157     __m128i xscaled;
 158     __m128i x = r->simd;
 159     __m128i slots = _mm_and_si128(x, _mm_set1_epi32(RANS_WORD_M - 1));
 160     uint32_t i0 = (uint32_t) _mm_cvtsi128_si32(slots);
 161     uint32_t i1 = (uint32_t) _mm_extract_epi32(slots, 1);
 162     uint32_t i2 = (uint32_t) _mm_extract_epi32(slots, 2);
 163     uint32_t i3 = (uint32_t) _mm_extract_epi32(slots, 3);
 164
 165     // symbol
 166     uint32_t s = tab->slot2sym[i0] | (tab->slot2sym[i1] << 8) | (tab->slot2sym[i2] << 16) | (tab->slot2sym[i3] << 24);
 167
 168     // gather freq_bias
 169     freq_bias_lo = _mm_cvtsi32_si128(tab->slots[i0].u32);
 170     freq_bias_lo = _mm_insert_epi32(freq_bias_lo, tab->slots[i1].u32, 1);
 171     freq_bias_hi = _mm_cvtsi32_si128(tab->slots[i2].u32);
 172     freq_bias_hi = _mm_insert_epi32(freq_bias_hi, tab->slots[i3].u32, 1);
 173     freq_bias = _mm_unpacklo_epi64(freq_bias_lo, freq_bias_hi);
 174
 175     // s, x = D(x)
 176     xscaled = _mm_srli_epi32(x, RANS_WORD_SCALE_BITS);
 177     freq = _mm_and_si128(freq_bias, _mm_set1_epi32(0xffff));
 178     bias = _mm_srli_epi32(freq_bias, 16);
 179     r->simd = _mm_add_epi32(_mm_mullo_epi32(xscaled, freq), bias);
 180     return s;
 181 }
 182
 183 // Renormalize after decoding a symbol.
 184 static inline void RansSimdDecRenorm(RansSimdDec* r, uint16_t** pptr)
 185 {
 186     static ALIGNSPEC(int8_t const, shuffles[16][16], 16) = {
 187 #define _ -1 // for readability
 188         { _,_,_,_, _,_,_,_, _,_,_,_, _,_,_,_ }, // 0000
 189         { 0,1,_,_, _,_,_,_, _,_,_,_, _,_,_,_ }, // 0001
 190         { _,_,_,_, 0,1,_,_, _,_,_,_, _,_,_,_ }, // 0010
 191         { 0,1,_,_, 2,3,_,_, _,_,_,_, _,_,_,_ }, // 0011
 192         { _,_,_,_, _,_,_,_, 0,1,_,_, _,_,_,_ }, // 0100
 193         { 0,1,_,_, _,_,_,_, 2,3,_,_, _,_,_,_ }, // 0101
 194         { _,_,_,_, 0,1,_,_, 2,3,_,_, _,_,_,_ }, // 0110
 195         { 0,1,_,_, 2,3,_,_, 4,5,_,_, _,_,_,_ }, // 0111
 196         { _,_,_,_, _,_,_,_, _,_,_,_, 0,1,_,_ }, // 1000
 197         { 0,1,_,_, _,_,_,_, _,_,_,_, 2,3,_,_ }, // 1001
 198         { _,_,_,_, 0,1,_,_, _,_,_,_, 2,3,_,_ }, // 1010
 199         { 0,1,_,_, 2,3,_,_, _,_,_,_, 4,5,_,_ }, // 1011
 200         { _,_,_,_, _,_,_,_, 0,1,_,_, 2,3,_,_ }, // 1100
 201         { 0,1,_,_, _,_,_,_, 2,3,_,_, 4,5,_,_ }, // 1101
 202         { _,_,_,_, 0,1,_,_, 2,3,_,_, 4,5,_,_ }, // 1110
 203         { 0,1,_,_, 2,3,_,_, 4,5,_,_, 6,7,_,_ }, // 1111
 204 #undef _
 205     };
 206     static uint8_t const numbits[16] = {
 207         0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4
 208     };
 209
 210     __m128i x = r->simd;
 211
 212     // NOTE: SSE2+ only offer a signed 32-bit integer compare, while we
 213     // need unsigned. So we subtract 0x80000000 before the compare,
 214     // which converts unsigned integers to signed integers in an
 215     // order-preserving manner.
 216     __m128i x_biased = _mm_xor_si128(x, _mm_set1_epi32((int) 0x80000000));
 217     __m128i greater = _mm_cmpgt_epi32(_mm_set1_epi32(RANS_WORD_L - 0x80000000), x_biased);
 218     unsigned int mask = _mm_movemask_ps(_mm_castsi128_ps(greater));
 219
 220     // NOTE: this will read slightly past the end of the input buffer.
 221     // In practice, either pad the input buffer by 8 bytes at the end,
 222     // or switch to the non-SIMD version once you get close to the end.
 223     __m128i memvals = _mm_loadl_epi64((const __m128i*)*pptr);
 224     __m128i xshifted = _mm_slli_epi32(x, 16);
 225     __m128i shufmask = _mm_load_si128((const __m128i*)shuffles[mask]);
 226     __m128i newx = _mm_or_si128(xshifted, _mm_shuffle_epi8(memvals, shufmask));
 227     r->simd = _mm_blendv_epi8(x, newx, greater);
 228     *pptr += numbits[mask];
 229 }
 230
 231 #endif // RANS_WORD_SSE41_HEADER
 232