Start trying to count the rANS distributions from the encoding shader.

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Tue, 10 Oct 2017 16:04:07 +0000 (18:04 +0200)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Tue, 10 Oct 2017 16:04:07 +0000 (18:04 +0200)
diff --git a/coded.dat b/coded.dat

index e94ba8c933720642879d621f143fa276b1602e0d..469306fbd75145cd0a0cc60dcf9b86fc8e9bc3f5 100644 (file)

Binary files a/coded.dat and b/coded.dat differ
diff --git a/encoder.shader b/encoder.shader

index dc17453fdcd0ef596728e82cc513c15ed7f4bbf3..da9e6c07b2c730274513087a6ffaa5239146b29e 100644 (file)
--- a/encoder.shader
+++ b/encoder.shader
@@ -12,6 +12,24 @@ layout(r8ui) uniform restrict readonly uimage2D image_tex;
  
  shared float temp[64];
  
+layout(std430, binding = 9) buffer layoutName
+{
+       uint dist[4][256];
+};
+
+#define MAPPING(s0, s1, s2, s3, s4, s5, s6, s7) ((s0) | (s1 << 2) | (s2 << 4) | (s3 << 6) | (s4 << 8) | (s5 << 10) | (s6 << 12) | (s7 << 14))
+
+const uint luma_mapping[8] = {
+       MAPPING(0, 0, 1, 1, 2, 2, 3, 3),
+       MAPPING(0, 0, 1, 2, 2, 2, 3, 3),
+       MAPPING(1, 1, 2, 2, 2, 3, 3, 3),
+       MAPPING(1, 1, 2, 2, 2, 3, 3, 3),
+       MAPPING(1, 2, 2, 2, 2, 3, 3, 3),
+       MAPPING(2, 2, 2, 2, 3, 3, 3, 3),
+       MAPPING(2, 2, 3, 3, 3, 3, 3, 3),
+       MAPPING(3, 3, 3, 3, 3, 3, 3, 3),
+};
+
  // Scale factors; 1.0 / (sqrt(2.0) * cos(k * M_PI / 16.0)), except for the first which is 1.
  const float sf[8] = {
         1.0, 0.7209598220069479, 0.765366864730180, 0.8504300947672564,
@@ -166,5 +184,39 @@ void main()
         imageStore(ac2_ac5_tex, ivec2(sx, y + n), uvec4(pack_9_7(c2, c5), 0, 0, 0));
         imageStore(ac3_tex,     ivec2(sx, y + n), ivec4(c3, 0, 0, 0));
         imageStore(ac4_tex,     ivec2(sx, y + n), ivec4(c4, 0, 0, 0));
+
+       // Count frequencies, but only for every 8th block or so, randomly selected.
+       uint wg_index = gl_WorkGroupID.y * gl_WorkGroupSize.x + gl_WorkGroupID.x;
+       if ((wg_index * 0x9E3779B9u) >> 29 == 0) {  // Fibonacci hashing, essentially a PRNG in this context.
+               c0 = min(abs(c0), 255);
+               c1 = min(abs(c1), 255);
+               c2 = min(abs(c2), 255);
+               c3 = min(abs(c3), 255);
+               c4 = min(abs(c4), 255);
+               c5 = min(abs(c5), 255);
+               c6 = min(abs(c6), 255);
+               c7 = min(abs(c7), 255);
+
+               // Spread out the most popular elements among the cache lines by reversing the bits
+               // of the index, reducing false sharing.
+               c0 = bitfieldReverse(c0) >> 24;
+               c1 = bitfieldReverse(c1) >> 24;
+               c2 = bitfieldReverse(c2) >> 24;
+               c3 = bitfieldReverse(c3) >> 24;
+               c4 = bitfieldReverse(c4) >> 24;
+               c5 = bitfieldReverse(c5) >> 24;
+               c6 = bitfieldReverse(c6) >> 24;
+               c7 = bitfieldReverse(c7) >> 24;
+
+               uint m = luma_mapping[n];
+               atomicAdd(dist[bitfieldExtract(m,  0, 2)][c0], 1);
+               atomicAdd(dist[bitfieldExtract(m,  2, 2)][c1], 1);
+               atomicAdd(dist[bitfieldExtract(m,  4, 2)][c2], 1);
+               atomicAdd(dist[bitfieldExtract(m,  6, 2)][c3], 1);
+               atomicAdd(dist[bitfieldExtract(m,  8, 2)][c4], 1);
+               atomicAdd(dist[bitfieldExtract(m, 10, 2)][c5], 1);
+               atomicAdd(dist[bitfieldExtract(m, 12, 2)][c6], 1);
+               atomicAdd(dist[bitfieldExtract(m, 14, 2)][c7], 1);
+       }
  }
  
diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp

index 2ddd8992fd2fe0da3b56bd2b0ec90598ceb93461..e8d20e042af141d2f404c3f4a676bd463ea4918b 100644 (file)
--- a/narabu-encoder.cpp
+++ b/narabu-encoder.cpp
@@ -334,6 +334,13 @@ int main(int argc, char **argv)
  
         glUseProgram(glsl_program_num);
  
+       // An SSBO for the rANS distributions.
+       GLuint ssbo;
+       glGenBuffers(1, &ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
+       glBufferData(GL_SHADER_STORAGE_BUFFER, 65536 * 4 * sizeof(uint32_t), nullptr, GL_DYNAMIC_COPY);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+
         // Upload luma.
         GLuint y_tex;
         glGenTextures(1, &y_tex);
@@ -392,7 +399,7 @@ int main(int argc, char **argv)
         check_error();
  
         steady_clock::time_point start = steady_clock::now();
-       unsigned num_iterations = 1000;
+       unsigned num_iterations = 100;
         for (unsigned i = 0; i < num_iterations; ++i) {
                 glDispatchCompute(WIDTH_BLOCKS, HEIGHT_BLOCKS, 1);
         }
@@ -575,4 +582,11 @@ int main(int argc, char **argv)
         printf("\n");
         printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);
  
+#if 1
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
+       const uint32_t *dist = (const uint32_t *)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
+       for (int i = 0; i < 1024; ++i) {
+               printf("%d,%d: %u\n", i / 256, i % 256, dist[i] / num_iterations);
+       }
+#endif
  }
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Tue, 10 Oct 2017 16:04:07 +0000 (18:04 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Tue, 10 Oct 2017 16:04:07 +0000 (18:04 +0200)
coded.dat		patch | blob | history
encoder.shader		patch | blob | history
narabu-encoder.cpp		patch | blob | history