X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=tally.shader;h=e0fb9431432493721de3d436bc57c383b3575514;hb=88ff4031f4d927b95106b8462ec4bd3c9edef718;hp=1623a1dd06e845af873ba62adcdaa177ffe87c4e;hpb=5e1d27014149311318e97b8e04a6e05ec858e57c;p=narabu

diff --git a/tally.shader b/tally.shader
index 1623a1d..e0fb943 100644
--- a/tally.shader
+++ b/tally.shader
@@ -1,4 +1,5 @@
 #version 440
+#extension GL_NV_gpu_shader5 : enable
 
 // http://cbloomrants.blogspot.no/2014/02/02-11-14-understanding-ans-10.html
 
@@ -7,11 +8,18 @@ layout(local_size_x = 256) in;
 layout(std430, binding = 9) buffer layoutName
 {
 	uint dist[4 * 256];
-	uvec2 ransdist[4 * 256];
+	uint ransfreq[4 * 256];  // main() stores each invocation's final freq (= new_val) at [base + i].
+};
+
+layout(std140, binding = 12) buffer distBlock  // Will become a UBO to rans.shader, thus layout std140.
+{
+	uvec4 ransdist[4 * 256];  // main() writes (x_max, rcp_freq, bias, (cmpl_freq << 16) | rcp_shift) at [base + i].
+	uint sign_biases[4];  // Invocation i == 255 stores its new_dist[i] here at the end of main(), indexed by gl_WorkGroupID.x.
 };
 
 const uint prob_bits = 12;
 const uint prob_scale = 1 << prob_bits;
+const uint RANS_BYTE_L = (1u << 23);  // Used in main() to derive x_max; presumably the rANS normalization lower bound (as in ryg_rans) -- TODO confirm.
 
 // FIXME: should come through a uniform.
 const uint sums[4] = { 57600, 115200, 302400, 446400 };
@@ -73,7 +81,7 @@ void main()
 
 	// Apply corrections one by one, greedily, until we are at the exact right sum.
 	if (actual_sum > prob_scale) {
-		float loss = -true_prob * log2(new_val / (new_val - 1));
+		float loss = true_prob * log2(new_val / float(new_val - 1));
 
 		voting_areas[i] = 0xffffffff;
 		memoryBarrierShared();
@@ -101,11 +109,11 @@ void main()
 
 			if (my_vote == voting_areas[vote_no]) {
 				--new_val;
-				loss = -true_prob * log2(new_val / (new_val - 1));
+				loss = true_prob * log2(new_val / float(new_val - 1));
 			}
 		}
 	} else {
-		float benefit = true_prob * log2(new_val / (new_val + 1));
+		float benefit = -true_prob * log2(new_val / float(new_val + 1));
 
 		voting_areas[i] = 0;
 		memoryBarrierShared();
@@ -130,7 +138,7 @@ void main()
 
 			if (my_vote == voting_areas[vote_no]) {
 				++new_val;
-				benefit = true_prob * log2(new_val / (new_val + 1));
+				benefit = -true_prob * log2(new_val / float(new_val + 1));
 			}
 		}
 	}
@@ -145,6 +153,12 @@ void main()
 	memoryBarrierShared();
 	barrier();
 
+	new_val = new_dist[i];
+
+	// TODO: Why do we need this next barrier? It makes no sense.
+	memoryBarrierShared();
+	barrier();
+
 	for (uint layer = 2; layer <= 256; layer *= 2) {
 		if ((i & (layer - 1)) == layer - 1) {
 			new_dist[i] += new_dist[i - (layer / 2)];
@@ -159,5 +173,32 @@ void main()
 		memoryBarrierShared();
 		barrier();
 	}
-	ransdist[base + i] = uvec2(new_val, new_dist[i]);
+
+	uint start = new_dist[i] - new_val;
+	uint freq = new_val;
+
+	uint x_max = ((RANS_BYTE_L >> (prob_bits + 1)) << 8) * freq;
+	uint cmpl_freq = ((1 << (prob_bits + 1)) - freq);
+	uint rcp_freq, rcp_shift, bias;
+	if (freq < 2) {
+		rcp_freq = ~0u;
+		rcp_shift = 0;
+		bias = start + (1 << (prob_bits + 1)) - 1;
+	} else {
+		uint shift = 0;
+		while (freq > (1u << shift)) {
+			shift++;
+		}
+
+		rcp_freq = uint(((uint64_t(1) << (shift + 31)) + freq-1) / freq);
+		rcp_shift = shift - 1;
+		bias = start;
+	}
+
+	ransfreq[base + i] = freq;
+	ransdist[base + i] = uvec4(x_max, rcp_freq, bias, (cmpl_freq << 16) | rcp_shift);
+
+	if (i == 255) {
+		sign_biases[gl_WorkGroupID.x] = new_dist[i];
+	}
 }