git.sesse.net Git - narabu/blobdiff - narabu-encoder.cpp
Speed up the histogram counting immensely by adding via local memory.
[narabu] / narabu-encoder.cpp
index 9e9f0ab8c036904717a79a5a2a6decfec07a3934..730135978c34c3bb604bcad6d3b893df0b2e7d16 100644 (file)
@@ -334,6 +334,13 @@ int main(int argc, char **argv)
 
        glUseProgram(glsl_program_num);
 
+       // An SSBO for the rANS distributions.
+       GLuint ssbo;
+       glGenBuffers(1, &ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
+       glBufferData(GL_SHADER_STORAGE_BUFFER, 65536 * 4 * sizeof(uint32_t), nullptr, GL_DYNAMIC_COPY);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+
        // Upload luma.
        GLuint y_tex;
        glGenTextures(1, &y_tex);
@@ -342,7 +349,7 @@ int main(int argc, char **argv)
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
-        glTexImage2D(GL_TEXTURE_2D, 0, GL_R8I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, pix_y);
+        glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, pix_y);
        check_error();
 
        // Make destination textures.
@@ -388,13 +395,13 @@ int main(int argc, char **argv)
        glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
        glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
        glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
-       glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
+       glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
        check_error();
 
        steady_clock::time_point start = steady_clock::now();
-       unsigned num_iterations = 1000;
+       unsigned num_iterations = 100;
        for (unsigned i = 0; i < num_iterations; ++i) {
-               glDispatchCompute(WIDTH_BLOCKS, HEIGHT_BLOCKS, 1);
+               glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
        }
        check_error();
        glFinish();
@@ -455,6 +462,15 @@ int main(int argc, char **argv)
                                coeff_y[y * WIDTH + xb*8 + 5],
                                coeff_y[y * WIDTH + xb*8 + 6],
                                coeff_y[y * WIDTH + xb*8 + 7]);
+                       printf("%4d %4d %4d %4d %4d %4d %4d %4d || ",
+                               pix_y[y * WIDTH + xb*8 + 0],
+                               pix_y[y * WIDTH + xb*8 + 1],
+                               pix_y[y * WIDTH + xb*8 + 2],
+                               pix_y[y * WIDTH + xb*8 + 3],
+                               pix_y[y * WIDTH + xb*8 + 4],
+                               pix_y[y * WIDTH + xb*8 + 5],
+                               pix_y[y * WIDTH + xb*8 + 6],
+                               pix_y[y * WIDTH + xb*8 + 7]);
                }
                printf("\n");
        }
@@ -566,4 +582,11 @@ int main(int argc, char **argv)
        printf("\n");
        printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);
 
+#if 1
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
+       const uint32_t *dist = (const uint32_t *)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
+       for (int i = 0; i < 1024; ++i) {
+               printf("%d,%d: %u\n", i / 256, i % 256, dist[i] / num_iterations);
+       }
+#endif
 }