Add rANS normalization to the encoder.

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 12 Oct 2017 17:02:07 +0000 (19:02 +0200)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 12 Oct 2017 17:04:00 +0000 (19:04 +0200)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 12 Oct 2017 17:02:07 +0000 (19:02 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 12 Oct 2017 17:04:00 +0000 (19:04 +0200)
diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp

index 730135978c34c3bb604bcad6d3b893df0b2e7d16..a4e77169b88d2df283f9b08c16fbc12c24f43431 100644 (file)
--- a/narabu-encoder.cpp
+++ b/narabu-encoder.cpp
@@ -332,13 +332,28 @@ int main(int argc, char **argv)
                 exit(1);
         }
  
+       // Compile the tally shader.
+       shader_src = ::read_file("tally.shader");
+       shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
+       GLuint glsl_tally_program_num = glCreateProgram();
+       glAttachShader(glsl_tally_program_num, shader_num);
+       glLinkProgram(glsl_tally_program_num);
+
+       glGetProgramiv(glsl_tally_program_num, GL_LINK_STATUS, &success);
+       if (success == GL_FALSE) {
+               GLchar error_log[1024] = {0};
+               glGetProgramInfoLog(glsl_tally_program_num, 1024, nullptr, error_log);
+               fprintf(stderr, "Error linking program: %s\n", error_log);
+               exit(1);
+       }
+
         glUseProgram(glsl_program_num);
  
         // An SSBO for the rANS distributions.
         GLuint ssbo;
         glGenBuffers(1, &ssbo);
         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
-       glBufferData(GL_SHADER_STORAGE_BUFFER, 65536 * 4 * sizeof(uint32_t), nullptr, GL_DYNAMIC_COPY);
+       glBufferData(GL_SHADER_STORAGE_BUFFER, 256 * 4 * sizeof(uint32_t), nullptr, GL_DYNAMIC_COPY);
         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
  
         // Upload luma.
@@ -398,10 +413,19 @@ int main(int argc, char **argv)
         glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
         check_error();
  
+       glUseProgram(glsl_tally_program_num);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+
         steady_clock::time_point start = steady_clock::now();
-       unsigned num_iterations = 100;
+       unsigned num_iterations = 1000;
         for (unsigned i = 0; i < num_iterations; ++i) {
+               glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 4 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr);
+               glUseProgram(glsl_program_num);
                 glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
+               glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+               glUseProgram(glsl_tally_program_num);
+               glDispatchCompute(4, 1, 1);
         }
         check_error();
         glFinish();
@@ -450,7 +474,7 @@ int main(int argc, char **argv)
                 }
         }
  
-#if 1
+#if 0
         for (unsigned y = 0; y < HEIGHT; ++y) {
                 for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
                         printf("%4d %4d %4d %4d %4d %4d %4d %4d | ",
@@ -586,7 +610,7 @@ int main(int argc, char **argv)
         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
         const uint32_t *dist = (const uint32_t *)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
         for (int i = 0; i < 1024; ++i) {
-               printf("%d,%d: %u\n", i / 256, i % 256, dist[i] / num_iterations);
+               printf("%d,%d: %u\n", i / 256, i % 256, dist[i]);
         }
  #endif
  }
diff --git a/tally.shader b/tally.shader

new file mode 100644 (file)

index 0000000..c83bfe4
--- /dev/null
+++ b/tally.shader
@@ -0,0 +1,162 @@
+#version 440
+
+// http://cbloomrants.blogspot.no/2014/02/02-11-14-understanding-ans-10.html
+
+layout(local_size_x = 256) in;
+
+layout(std430, binding = 9) buffer layoutName
+{
+       uint dist[4 * 256];
+};
+
+const uint prob_bits = 12;
+const uint prob_scale = 1 << prob_bits;
+
+// FIXME: should come through a uniform.
+const uint sums[4] = { 57600, 115200, 302400, 446400 };
+
+shared uint new_dist[256];
+shared uint shared_tmp = 0;
+
+// Multiple voting areas for the min/max reductions, so we don't need to clear them
+// for every round.
+shared uint voting_areas[256];
+
+void main()
+{
+       uint base = 256 * gl_WorkGroupID.x;
+       uint sum = sums[gl_WorkGroupID.x];
+       uint zero_correction;
+       uint i = gl_LocalInvocationID.x;  // Which element we are working on.
+
+       // Zero has no sign bit. Yes, this is trickery.
+       if (i == 0) {
+               uint old_zero_freq = dist[base];
+               uint new_zero_freq = (old_zero_freq + 1) / 2;  // Rounding up ensures we get a zero frequency.
+               zero_correction = old_zero_freq - new_zero_freq;
+               shared_tmp = sum - zero_correction;
+               dist[base] = new_zero_freq;
+       }
+
+       memoryBarrierShared();
+       barrier();
+
+       sum = shared_tmp;
+
+       // Normalize the pdf, taking care to never make a nonzero frequency into a zero.
+       // This rounding is presumably optimal according to cbloom.
+       float true_prob = float(dist[base + i]) / sum;
+       float from_scaled = true_prob * prob_scale;
+       float down = floor(from_scaled);
+       uint new_val = (from_scaled * from_scaled <= down * (down + 1)) ? uint(down) : uint(down) + 1;
+       if (dist[base + i] > 0) {
+               new_val = max(new_val, 1);
+       }
+
+       // Reset shared_tmp. We could do without this barrier if we wanted to,
+       // but meh. :-)
+       if (i == 0) {
+               shared_tmp = 0;
+       }
+
+       memoryBarrierShared();
+       barrier();
+
+       // Parallel sum to find the total.
+       atomicAdd(shared_tmp, new_val);
+
+       memoryBarrierShared();
+       barrier();
+
+       uint actual_sum = shared_tmp;
+
+       // Apply corrections one by one, greedily, until we are at the exact right sum.
+       if (actual_sum > prob_scale) {
+               float loss = -true_prob * log2(new_val / (new_val - 1));
+
+               voting_areas[i] = 0xffffffff;
+               memoryBarrierShared();
+               barrier();
+
+               uint vote_no = 0;
+
+               for ( ; actual_sum != prob_scale; --actual_sum, ++vote_no) {
+                       memoryBarrierShared();
+                       barrier();
+
+                       // Stick the thread ID in the lower mantissa bits so we never get a tie.
+                       uint my_vote = (floatBitsToUint(loss) & ~0xff) | gl_LocalInvocationID.x;
+                       if (new_val <= 1) {
+                               // We can't touch this one any more, but it needs to participate in the barriers,
+                               // so we can't break.
+                               my_vote = 0xffffffff;
+                       }
+
+                       // Find out which thread has the one with the smallest loss.
+                       // (Positive floats compare like uints just fine.)
+                       voting_areas[vote_no] = atomicMin(voting_areas[vote_no], my_vote);
+                       memoryBarrierShared();
+                       barrier();
+
+                       if (my_vote == voting_areas[vote_no]) {
+                               --new_val;
+                               loss = -true_prob * log2(new_val / (new_val - 1));
+                       }
+               }
+       } else {
+               float benefit = true_prob * log2(new_val / (new_val + 1));
+
+               voting_areas[i] = 0;
+               memoryBarrierShared();
+               barrier();
+
+               uint vote_no = 0;
+
+               for ( ; actual_sum != prob_scale; ++actual_sum, ++vote_no) {
+                       // Stick the thread ID in the lower mantissa bits so we never get a tie.
+                       uint my_vote = (floatBitsToUint(benefit) & ~0xff) | gl_LocalInvocationID.x;
+                       if (new_val == 0) {
+                               // It's meaningless to increase this, but it needs to participate in the barriers,
+                               // so we can't break.
+                               my_vote = 0;
+                       }
+
+                       // Find out which thread has the one with the most benefit.
+                       // (Positive floats compare like uints just fine.)
+                       voting_areas[vote_no] = atomicMax(voting_areas[vote_no], my_vote);
+                       memoryBarrierShared();
+                       barrier();
+
+                       if (my_vote == voting_areas[vote_no]) {
+                               ++new_val;
+                               benefit = true_prob * log2(new_val / (new_val + 1));
+                       }
+               }
+       }
+
+       if (i == 0) {
+               // Undo what we did above.
+               new_val *= 2;
+       }
+
+       // Parallel prefix sum.
+       new_dist[(i + 255) & 255] = new_val;  // Move the zero symbol last.
+       memoryBarrierShared();
+       barrier();
+
+       for (uint layer = 2; layer <= 256; layer *= 2) {
+               if ((i & (layer - 1)) == layer - 1) {
+                       new_dist[i] += new_dist[i - (layer / 2)];
+               }
+               memoryBarrierShared();
+               barrier();
+       }
+       for (uint layer = 128; layer >= 2; layer /= 2) {
+               if ((i & (layer - 1)) == layer - 1 && i != 255) {
+                       new_dist[i + (layer / 2)] += new_dist[i];
+               }
+               memoryBarrierShared();
+               barrier();
+       }
+       dist[base + i] = new_dist[i];
+}
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 12 Oct 2017 17:02:07 +0000 (19:02 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 12 Oct 2017 17:04:00 +0000 (19:04 +0200)
narabu-encoder.cpp		patch \| blob \| history
tally.shader	[new file with mode: 0644]	patch \| blob