X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=tally.shader;h=e0fb9431432493721de3d436bc57c383b3575514;hb=88ff4031f4d927b95106b8462ec4bd3c9edef718;hp=1623a1dd06e845af873ba62adcdaa177ffe87c4e;hpb=5e1d27014149311318e97b8e04a6e05ec858e57c;p=narabu diff --git a/tally.shader b/tally.shader index 1623a1d..e0fb943 100644 --- a/tally.shader +++ b/tally.shader @@ -1,4 +1,5 @@ #version 440 +#extension GL_NV_gpu_shader5 : enable // http://cbloomrants.blogspot.no/2014/02/02-11-14-understanding-ans-10.html @@ -7,11 +8,18 @@ layout(local_size_x = 256) in; layout(std430, binding = 9) buffer layoutName { uint dist[4 * 256]; - uvec2 ransdist[4 * 256]; + uint ransfreq[4 * 256]; +}; + +layout(std140, binding = 12) buffer distBlock // Will become an UBO to rans.shader, thus layout std140. +{ + uvec4 ransdist[4 * 256]; + uint sign_biases[4]; }; const uint prob_bits = 12; const uint prob_scale = 1 << prob_bits; +const uint RANS_BYTE_L = (1u << 23); // FIXME: should come through a uniform. const uint sums[4] = { 57600, 115200, 302400, 446400 }; @@ -73,7 +81,7 @@ void main() // Apply corrections one by one, greedily, until we are at the exact right sum. if (actual_sum > prob_scale) { - float loss = -true_prob * log2(new_val / (new_val - 1)); + float loss = true_prob * log2(new_val / float(new_val - 1)); voting_areas[i] = 0xffffffff; memoryBarrierShared(); @@ -101,11 +109,11 @@ void main() if (my_vote == voting_areas[vote_no]) { --new_val; - loss = -true_prob * log2(new_val / (new_val - 1)); + loss = true_prob * log2(new_val / float(new_val - 1)); } } } else { - float benefit = true_prob * log2(new_val / (new_val + 1)); + float benefit = -true_prob * log2(new_val / float(new_val + 1)); voting_areas[i] = 0; memoryBarrierShared(); @@ -130,7 +138,7 @@ void main() if (my_vote == voting_areas[vote_no]) { ++new_val; - benefit = true_prob * log2(new_val / (new_val + 1)); + benefit = -true_prob * log2(new_val / float(new_val + 1)); } } } @@ -145,6 +153,12 @@ void main() memoryBarrierShared(); barrier(); + new_val = new_dist[i]; + + // TODO: Why do we need this next barrier? It makes no sense. + memoryBarrierShared(); + barrier(); + for (uint layer = 2; layer <= 256; layer *= 2) { if ((i & (layer - 1)) == layer - 1) { new_dist[i] += new_dist[i - (layer / 2)]; @@ -159,5 +173,32 @@ void main() memoryBarrierShared(); barrier(); } - ransdist[base + i] = uvec2(new_val, new_dist[i]); + + uint start = new_dist[i] - new_val; + uint freq = new_val; + + uint x_max = ((RANS_BYTE_L >> (prob_bits + 1)) << 8) * freq; + uint cmpl_freq = ((1 << (prob_bits + 1)) - freq); + uint rcp_freq, rcp_shift, bias; + if (freq < 2) { + rcp_freq = ~0u; + rcp_shift = 0; + bias = start + (1 << (prob_bits + 1)) - 1; + } else { + uint shift = 0; + while (freq > (1u << shift)) { + shift++; + } + + rcp_freq = uint(((uint64_t(1) << (shift + 31)) + freq-1) / freq); + rcp_shift = shift - 1; + bias = start; + } + + ransfreq[base + i] = freq; + ransdist[base + i] = uvec4(x_max, rcp_freq, bias, (cmpl_freq << 16) | rcp_shift); + + if (i == 255) { + sign_biases[gl_WorkGroupID.x] = new_dist[i]; + } }