git.sesse.net Git - narabu/blobdiff - narabu-encoder.cpp
Pull the rANS distributions into a uniform buffer (UBO) instead of reading them from an SSBO. Speeds things up a bit.
[narabu] / narabu-encoder.cpp
index ca285282110480de405eadd04acf495898d2be6c..b53cfc074e778bac541c8a522de1b5c81f7e55b5 100644 (file)
@@ -41,13 +41,19 @@ unsigned char pix_y[WIDTH * HEIGHT];
 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
 
-struct RansDistSSBO {
+struct RansCountSSBO {
        unsigned dist[4 * 256];
        unsigned ransfreq[4 * 256];
+};
+
+struct RansDistUBO {
        struct {
                uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq;
        } ransdist[4 * 256];
-       unsigned sign_biases[4];
+       struct {
+               uint32_t val;
+               uint32_t padding[3];  // std140 layout.
+       } sign_biases[4];
 };
 
 using namespace std;
@@ -194,11 +200,24 @@ int main(int argc, char **argv)
        }
        check_error();
 
-       // An SSBO for the rANS distributions.
+       // An SSBO for the raw rANS counts.
        GLuint ssbo;
        glGenBuffers(1, &ssbo);
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
-       glNamedBufferStorage(ssbo, sizeof(RansDistSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       // An SSBO holding the rANS distributions, plus a UBO that they get copied into.
+       GLuint dist_ssbo;
+       glGenBuffers(1, &dist_ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo);
+       glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       GLuint dist_ubo;
+       glGenBuffers(1, &dist_ubo);
+       glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo);
+       glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        check_error();
 
        // SSBOs for the rANS output (data and offsets).
@@ -220,11 +239,12 @@ int main(int argc, char **argv)
 
        glUseProgram(glsl_tally_program_num);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo);
 
        glUseProgram(glsl_rans_program_num);
-       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo);
+       glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo);
 
        glUseProgram(glsl_program_num);
        check_error();
@@ -304,7 +324,7 @@ int main(int argc, char **argv)
        steady_clock::time_point start = steady_clock::now();
        unsigned num_iterations = 100;
        for (unsigned i = 0; i < num_iterations; ++i) {
-               glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansDistSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr);
+               glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr);
                glUseProgram(glsl_program_num);
                glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
                glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
@@ -313,6 +333,9 @@ int main(int argc, char **argv)
                glDispatchCompute(4, 1, 1);
                glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
        
+               glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO));
+               glMemoryBarrier(GL_UNIFORM_BARRIER_BIT);
+
                glUseProgram(glsl_rans_program_num);
                glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5);
        }
@@ -340,20 +363,21 @@ int main(int argc, char **argv)
        }
 
        // Write out the distributions.
-       const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansDistSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        for (unsigned r = 0; r < 2; ++r) {  // Hack to write fake chroma tables.
                // TODO: rather gamma-k or something
                for (unsigned i = 0; i < 4; ++i) {
                        printf("writing table %d\n", i);
                        for (unsigned j = 0; j < NUM_SYMS; ++j) {
                                printf("%d,%d: freq=%d  x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n",
-                                       i, j, rans_dist->ransfreq[i * 256 + j],
+                                       i, j, rans_count->ransfreq[i * 256 + j],
                                        rans_dist->ransdist[i * 256 + j].x_max,
                                        rans_dist->ransdist[i * 256 + j].rcp_freq,
                                        rans_dist->ransdist[i * 256 + j].bias,
                                        rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff,
                                        rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16);
-                               write_varint(rans_dist->ransfreq[i * 256 + j], codedfp);
+                               write_varint(rans_count->ransfreq[i * 256 + j], codedfp);
                        }
                }
        }