#version 440
+#extension GL_ARB_gpu_shader_int64 : enable
// http://cbloomrants.blogspot.no/2014/02/02-11-14-understanding-ans-10.html
layout(std430, binding = 9) buffer layoutName
{
uint dist[4 * 256];
+ uint ransfreq[4 * 256];
+};
+
+layout(std140, binding = 12) buffer distBlock // Will become an UBO to rans.shader, thus layout std140.
+{
+ uvec4 ransdist[4 * 256];
+ uint sign_biases[4];
};
const uint prob_bits = 12;
const uint prob_scale = 1 << prob_bits;
+const uint RANS_BYTE_L = (1u << 23);
// FIXME: should come through a uniform.
const uint sums[4] = { 57600, 115200, 302400, 446400 };
// Apply corrections one by one, greedily, until we are at the exact right sum.
if (actual_sum > prob_scale) {
- float loss = -true_prob * log2(new_val / (new_val - 1));
+ float loss = true_prob * log2(new_val / float(new_val - 1));
voting_areas[i] = 0xffffffff;
memoryBarrierShared();
if (my_vote == voting_areas[vote_no]) {
--new_val;
- loss = -true_prob * log2(new_val / (new_val - 1));
+ loss = true_prob * log2(new_val / float(new_val - 1));
}
}
} else {
- float benefit = true_prob * log2(new_val / (new_val + 1));
+ float benefit = -true_prob * log2(new_val / float(new_val + 1));
voting_areas[i] = 0;
memoryBarrierShared();
if (my_vote == voting_areas[vote_no]) {
++new_val;
- benefit = true_prob * log2(new_val / (new_val + 1));
+ benefit = -true_prob * log2(new_val / float(new_val + 1));
}
}
}
memoryBarrierShared();
barrier();
+ new_val = new_dist[i];
+
+ // TODO: Why do we need this next barrier? It makes no sense.
+ memoryBarrierShared();
+ barrier();
+
for (uint layer = 2; layer <= 256; layer *= 2) {
if ((i & (layer - 1)) == layer - 1) {
new_dist[i] += new_dist[i - (layer / 2)];
memoryBarrierShared();
barrier();
}
- dist[base + i] = new_dist[i];
+
+ uint start = new_dist[i] - new_val;
+ uint freq = new_val;
+
+ uint x_max = ((RANS_BYTE_L >> (prob_bits + 1)) << 8) * freq;
+ uint cmpl_freq = ((1 << (prob_bits + 1)) - freq);
+ uint rcp_freq, rcp_shift, bias;
+ if (freq < 2) {
+ rcp_freq = ~0u;
+ rcp_shift = 0;
+ bias = start + (1 << (prob_bits + 1)) - 1;
+ } else {
+ uint shift = 0;
+ while (freq > (1u << shift)) {
+ shift++;
+ }
+
+ rcp_freq = uint(((uint64_t(1) << (shift + 31)) + freq-1) / freq);
+ rcp_shift = shift - 1;
+ bias = start;
+ }
+
+ ransfreq[base + i] = freq;
+ ransdist[base + i] = uvec4(x_max, rcp_freq, bias, (cmpl_freq << 16) | rcp_shift);
+
+ if (i == 255) {
+ sign_biases[gl_WorkGroupID.x] = new_dist[i];
+ }
}