1 // 64-bit rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014
3 // This uses 64-bit states (63-bit actually) which allows renormalizing
4 // by writing out a whole 32 bits at a time (b=2^32) while still
5 // retaining good precision and allowing for high probability resolution.
7 // The only caveat is that this version requires 64-bit arithmetic; in
8 // particular, the encoder approximation in the bottom half requires a
9 // fast way to obtain the top 64 bits of an unsigned 64*64 bit product.
11 // In short, as written, this code works on 64-bit targets only!
19 #define Rans64Assert assert
21 #define Rans64Assert(x)
24 // --------------------------------------------------------------------------
26 // This code needs support for 64-bit long multiplies with 128-bit result
27 // (or more precisely, the top 64 bits of a 128-bit result). This is not
28 // really portable functionality, so we need some compiler-specific hacks
35 static inline uint64_t Rans64MulHi(uint64_t a, uint64_t b)
40 #elif defined(__GNUC__)
42 static inline uint64_t Rans64MulHi(uint64_t a, uint64_t b)
44 return (uint64_t) (((unsigned __int128)a * b) >> 64);
49 #error Unknown/unsupported compiler!
53 // --------------------------------------------------------------------------
55 // L ('l' in the paper) is the lower bound of our normalization interval.
56 // Between this and our 32-bit-aligned emission, we use 63 (not 64!) bits.
57 // This is done intentionally because exact reciprocals for 63-bit uints
58 // fit in 64-bit uints: this permits some optimizations during encoding.
59 #define RANS64_L (1ull << 31) // lower bound of our normalization interval
61 // State for a rANS encoder. Yep, that's all there is to it.
62 typedef uint64_t Rans64State;
64 // Initialize a rANS encoder.
65 static inline void Rans64EncInit(Rans64State* r)
70 // Encodes a single symbol with range start "start" and frequency "freq".
71 // All frequencies are assumed to sum to "1 << scale_bits", and the
72 // resulting bytes get written to ptr (which is updated).
74 // NOTE: With rANS, you need to encode symbols in *reverse order*, i.e. from
75 // beginning to end! Likewise, the output bytestream is written *backwards*:
76 // ptr starts pointing at the end of the output buffer and keeps decrementing.
77 static inline void Rans64EncPut(Rans64State* r, uint32_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
79 Rans64Assert(freq != 0);
81 // renormalize (never needs to loop)
83 uint64_t x_max = ((RANS64_L >> scale_bits) << 32) * freq; // this turns into a shift.
86 **pptr = (uint32_t) x;
88 Rans64Assert(x < x_max);
92 *r = ((x / freq) << scale_bits) + (x % freq) + start;
95 // Flushes the rANS encoder.
96 static inline void Rans64EncFlush(Rans64State* r, uint32_t** pptr)
101 (*pptr)[0] = (uint32_t) (x >> 0);
102 (*pptr)[1] = (uint32_t) (x >> 32);
105 // Initializes a rANS decoder.
106 // Unlike the encoder, the decoder works forwards as you'd expect.
107 static inline void Rans64DecInit(Rans64State* r, uint32_t** pptr)
111 x = (uint64_t) ((*pptr)[0]) << 0;
112 x |= (uint64_t) ((*pptr)[1]) << 32;
117 // Returns the current cumulative frequency (map it to a symbol yourself!)
118 static inline uint32_t Rans64DecGet(Rans64State* r, uint32_t scale_bits)
120 return *r & ((1u << scale_bits) - 1);
123 // Advances in the bit stream by "popping" a single symbol with range start
124 // "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits",
125 // and the resulting bytes get written to ptr (which is updated).
126 static inline void Rans64DecAdvance(Rans64State* r, uint32_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits)
128 uint64_t mask = (1ull << scale_bits) - 1;
132 x = freq * (x >> scale_bits) + (x & mask) - start;
136 x = (x << 32) | **pptr;
138 Rans64Assert(x >= RANS64_L);
144 // --------------------------------------------------------------------------
146 // That's all you need for a full encoder; below here are some utility
147 // functions with extra convenience or optimizations.
149 // Encoder symbol description
150 // This (admittedly odd) selection of parameters was chosen to make
151 // RansEncPutSymbol as cheap as possible.
153 uint64_t rcp_freq; // Fixed-point reciprocal frequency
154 uint32_t freq; // Symbol frequency
155 uint32_t bias; // Bias
156 uint32_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq
157 uint32_t rcp_shift; // Reciprocal shift
160 // Decoder symbols are straightforward.
162 uint32_t start; // Start of range.
163 uint32_t freq; // Symbol frequency.
166 // Initializes an encoder symbol to start "start" and frequency "freq"
167 static inline void Rans64EncSymbolInit(Rans64EncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits)
169 Rans64Assert(scale_bits <= 31);
170 Rans64Assert(start <= (1u << scale_bits));
171 Rans64Assert(freq <= (1u << scale_bits) - start);
173 // Say M := 1 << scale_bits.
175 // The original encoder does:
176 // x_new = (x/freq)*M + start + (x%freq)
178 // The fast encoder does (schematically):
179 // q = mul_hi(x, rcp_freq) >> rcp_shift (division)
180 // r = x - q*freq (remainder)
181 // x_new = q*M + bias + r (new x)
182 // plugging in r into x_new yields:
183 // x_new = bias + x + q*(M - freq)
184 // =: bias + x + q*cmpl_freq (*)
186 // and we can just precompute cmpl_freq. Now we just need to
187 // set up our parameters such that the original encoder and
188 // the fast encoder agree.
191 s->cmpl_freq = ((1 << scale_bits) - freq);
193 // freq=0 symbols are never valid to encode, so it doesn't matter what
194 // we set our values to.
196 // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately,
197 // our fixed-point reciprocal approximation can only multiply by values
200 // So we use the "next best thing": rcp_freq=~0, rcp_shift=0.
202 // q = mul_hi(x, rcp_freq) >> rcp_shift
203 // = mul_hi(x, (1<<64) - 1)) >> 0
204 // = floor(x - x/(2^64))
205 // = x - 1 if 1 <= x < 2^64
206 // and we know that x>0 (x=0 is never in a valid normalization interval).
208 // So we now need to choose the other parameters such that
209 // x_new = x*M + start
211 // x*M + start (desired result)
212 // = bias + x + q*cmpl_freq (*)
213 // = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq)
214 // = bias + 1 + (x - 1)*M
215 // = x*M + (bias + 1 - M)
217 // so we have start = bias + 1 - M, or equivalently
218 // bias = start + M - 1.
221 s->bias = start + (1 << scale_bits) - 1;
223 // Alverson, "Integer Division using reciprocals"
224 // shift=ceil(log2(freq))
226 uint64_t x0, x1, t0, t1;
227 while (freq > (1u << shift))
230 // long divide ((uint128) (1 << (shift + 63)) + freq-1) / freq
231 // by splitting it into two 64:64 bit divides (this works because
232 // the dividend has a simple form.)
234 x1 = 1ull << (shift + 31);
237 x0 += (x1 % freq) << 32;
240 s->rcp_freq = t0 + (t1 << 32);
241 s->rcp_shift = shift - 1;
243 // With these values, 'q' is the correct quotient, so we
249 // Initialize a decoder symbol to start "start" and frequency "freq"
250 static inline void Rans64DecSymbolInit(Rans64DecSymbol* s, uint32_t start, uint32_t freq)
252 Rans64Assert(start <= (1 << 31));
253 Rans64Assert(freq <= (1 << 31) - start);
258 // Encodes a given symbol. This is faster than straight RansEnc since we can do
259 // multiplications instead of a divide.
261 // See RansEncSymbolInit for a description of how this works.
262 static inline void Rans64EncPutSymbol(Rans64State* r, uint32_t** pptr, Rans64EncSymbol const* sym, uint32_t scale_bits)
264 Rans64Assert(sym->freq != 0); // can't encode symbol with freq=0
268 uint64_t x_max = ((RANS64_L >> scale_bits) << 32) * sym->freq; // turns into a shift
271 **pptr = (uint32_t) x;
276 uint64_t q = Rans64MulHi(x, sym->rcp_freq) >> sym->rcp_shift;
277 *r = x + sym->bias + q * sym->cmpl_freq;
280 // Equivalent to RansDecAdvance that takes a symbol.
281 static inline void Rans64DecAdvanceSymbol(Rans64State* r, uint32_t** pptr, Rans64DecSymbol const* sym, uint32_t scale_bits)
283 Rans64DecAdvance(r, pptr, sym->start, sym->freq, scale_bits);
286 // Advances in the bit stream by "popping" a single symbol with range start
287 // "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits".
288 // No renormalization or output happens.
289 static inline void Rans64DecAdvanceStep(Rans64State* r, uint32_t start, uint32_t freq, uint32_t scale_bits)
291 uint64_t mask = (1u << scale_bits) - 1;
295 *r = freq * (x >> scale_bits) + (x & mask) - start;
298 // Equivalent to RansDecAdvanceStep that takes a symbol.
299 static inline void Rans64DecAdvanceSymbolStep(Rans64State* r, Rans64DecSymbol const* sym, uint32_t scale_bits)
301 Rans64DecAdvanceStep(r, sym->start, sym->freq, scale_bits);
305 static inline void Rans64DecRenorm(Rans64State* r, uint32_t** pptr)
310 x = (x << 32) | **pptr;
312 Rans64Assert(x >= RANS64_L);
318 #endif // RANS64_HEADER