#version 440
+#extension GL_NV_gpu_shader5 : enable
// http://cbloomrants.blogspot.no/2014/02/02-11-14-understanding-ans-10.html
layout(std430, binding = 9) buffer layoutName
{
uint dist[4 * 256];
- uvec2 ransdist[4 * 256];
+ uint ransfreq[4 * 256]; // Per-entry symbol frequency; slot [base + i] receives `freq` (the value new_dist[i] held before the prefix sum).
+};
+
+layout(std140, binding = 12) buffer distBlock // Will become a UBO to rans.shader, thus layout std140.
+{
+ uvec4 ransdist[4 * 256]; // Per entry: (x_max, rcp_freq, bias, (cmpl_freq << 16) | rcp_shift).
+ uint sign_biases[4]; // Indexed by gl_WorkGroupID.x; written by invocation i == 255 with its final new_dist[i]. std140 rounds the array stride up to 16 bytes per element.
};
const uint prob_bits = 12;
const uint prob_scale = 1 << prob_bits;
+const uint RANS_BYTE_L = (1u << 23); // Used to derive x_max; presumably the lower bound of the rANS normalization interval, as in ryg_rans -- TODO confirm.
// FIXME: should come through a uniform.
const uint sums[4] = { 57600, 115200, 302400, 446400 };
new_val = new_dist[i];
+ // TODO: Why do we need this next barrier? It makes no sense.
+ memoryBarrierShared();
+ barrier();
+
for (uint layer = 2; layer <= 256; layer *= 2) {
if ((i & (layer - 1)) == layer - 1) {
new_dist[i] += new_dist[i - (layer / 2)];
memoryBarrierShared();
barrier();
}
- ransdist[base + i] = uvec2(new_dist[i] - new_val, new_val);
+
+ uint start = new_dist[i] - new_val;
+ uint freq = new_val;
+
+ uint x_max = ((RANS_BYTE_L >> (prob_bits + 1)) << 8) * freq;
+ uint cmpl_freq = ((1 << (prob_bits + 1)) - freq);
+ uint rcp_freq, rcp_shift, bias;
+ if (freq < 2) {
+ rcp_freq = ~0u;
+ rcp_shift = 0;
+ bias = start + (1 << (prob_bits + 1)) - 1;
+ } else {
+ uint shift = 0;
+ while (freq > (1u << shift)) {
+ shift++;
+ }
+
+ rcp_freq = uint(((uint64_t(1) << (shift + 31)) + freq-1) / freq);
+ rcp_shift = shift - 1;
+ bias = start;
+ }
+
+ ransfreq[base + i] = freq;
+ ransdist[base + i] = uvec4(x_max, rcp_freq, bias, (cmpl_freq << 16) | rcp_shift);
+
+ if (i == 255) {
+ sign_biases[gl_WorkGroupID.x] = new_dist[i];
+ }
}