git.sesse.net Git - narabu/commitdiff
Pull the rANS distributions into uniforms instead of SSBOs. Speeds up stuff a bit.
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 17 Oct 2017 20:33:54 +0000 (22:33 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 17 Oct 2017 20:33:54 +0000 (22:33 +0200)
narabu-encoder.cpp
rans.shader
tally.shader

index ca285282110480de405eadd04acf495898d2be6c..b53cfc074e778bac541c8a522de1b5c81f7e55b5 100644 (file)
@@ -41,13 +41,19 @@ unsigned char pix_y[WIDTH * HEIGHT];
 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
 
-struct RansDistSSBO {
+struct RansCountSSBO {  // CPU-side mirror of the binding-9 SSBO holding the raw rANS counts and frequencies.
        unsigned dist[4 * 256];
        unsigned ransfreq[4 * 256];
+};
+
+struct RansDistUBO {  // CPU-side mirror of the std140 distribution block (SSBO binding 12, UBO binding 13).
        struct {
                uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq;
        } ransdist[4 * 256];
-       unsigned sign_biases[4];
+       struct {
+               uint32_t val;
+               uint32_t padding[3];  // std140 layout: each array element occupies 16 bytes.
+       } sign_biases[4];
 };
 
 using namespace std;
@@ -194,11 +200,24 @@ int main(int argc, char **argv)
        }
        check_error();
 
-       // An SSBO for the rANS distributions.
+       // An SSBO for the raw rANS counts.
        GLuint ssbo;
        glGenBuffers(1, &ssbo);
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
-       glNamedBufferStorage(ssbo, sizeof(RansDistSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       // UBO for the rANS distributions (copied from an SSBO).
+       GLuint dist_ssbo;
+       glGenBuffers(1, &dist_ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo);
+       glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       GLuint dist_ubo;
+       glGenBuffers(1, &dist_ubo);
+       glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo);
+       glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        check_error();
 
        // SSBOs for the rANS output (data and offsets).
@@ -220,11 +239,12 @@ int main(int argc, char **argv)
 
        glUseProgram(glsl_tally_program_num);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo);
 
        glUseProgram(glsl_rans_program_num);
-       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo);
+       glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo);
 
        glUseProgram(glsl_program_num);
        check_error();
@@ -304,7 +324,7 @@ int main(int argc, char **argv)
        steady_clock::time_point start = steady_clock::now();
        unsigned num_iterations = 100;
        for (unsigned i = 0; i < num_iterations; ++i) {
-               glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansDistSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr);
+               glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr);
                glUseProgram(glsl_program_num);
                glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
                glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
@@ -313,6 +333,9 @@ int main(int argc, char **argv)
                glDispatchCompute(4, 1, 1);
                glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
        
+               glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);  // Shader writes to dist_ssbo must be visible to the buffer copy; the copy itself is an ordered GL command, so no barrier is needed after it.
+               glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO));
+
                glUseProgram(glsl_rans_program_num);
                glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5);
        }
@@ -340,20 +363,21 @@ int main(int argc, char **argv)
        }
 
        // Write out the distributions.
-       const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansDistSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        for (unsigned r = 0; r < 2; ++r) {  // Hack to write fake chroma tables.
                // TODO: rather gamma-k or something
                for (unsigned i = 0; i < 4; ++i) {
                        printf("writing table %d\n", i);
                        for (unsigned j = 0; j < NUM_SYMS; ++j) {
                                printf("%d,%d: freq=%d  x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n",
-                                       i, j, rans_dist->ransfreq[i * 256 + j],
+                                       i, j, rans_count->ransfreq[i * 256 + j],
                                        rans_dist->ransdist[i * 256 + j].x_max,
                                        rans_dist->ransdist[i * 256 + j].rcp_freq,
                                        rans_dist->ransdist[i * 256 + j].bias,
                                        rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff,
                                        rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16);
-                               write_varint(rans_dist->ransfreq[i * 256 + j], codedfp);
+                               write_varint(rans_count->ransfreq[i * 256 + j], codedfp);
                        }
                }
        }
index b677415952f89ef67f37c69926a71759357a229b..c0272312118a4a00a00a9c0ce6197fb8616691dc 100644 (file)
@@ -24,14 +24,6 @@ const uint luma_mapping[8] = {
        MAPPING(3, 3, 3, 3, 3, 3, 3, 3),
 };
 
-layout(std430, binding = 9) buffer layoutName
-{
-       uint dist[4 * 256];
-       uint ransfreq[4 * 256];
-       uvec4 ransdist[4 * 256];
-       uint sign_biases[4];
-};
-
 layout(std430, binding = 10) buffer outputBuf
 {
        uint8_t rans_output[];
@@ -42,6 +34,12 @@ layout(std430, binding = 11) buffer outputBuf2
        uint rans_start_offset[];
 };
 
+layout(std140, binding = 13) uniform DistBlock  // Backed by dist_ubo in narabu-encoder.cpp, which is copied from the std140 buffer at binding 12.
+{
+       uvec4 ransdist[4 * 256];  // Components in order: x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq (see RansDistUBO).
+       uint sign_biases[4];  // std140 pads each array element to 16 bytes, hence the padding in RansDistUBO.
+};
+
 struct RansEncoder {
        uint stream_num;   // const
        uint lut_base;     // const
index 351d3fb54e7aeb22dad59ec10019199382799663..e0fb9431432493721de3d436bc57c383b3575514 100644 (file)
@@ -9,6 +9,10 @@ layout(std430, binding = 9) buffer layoutName
 {
        uint dist[4 * 256];
        uint ransfreq[4 * 256];
+};
+
+layout(std140, binding = 12) buffer distBlock  // Will become an UBO to rans.shader, thus layout std140.
+{
        uvec4 ransdist[4 * 256];
        uint sign_biases[4];
 };