From 88ff4031f4d927b95106b8462ec4bd3c9edef718 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Tue, 17 Oct 2017 22:33:54 +0200 Subject: [PATCH] Pull the rANS distributions into uniforms instead of SSBOs. Speeds up stuff a bit. --- narabu-encoder.cpp | 42 +++++++++++++++++++++++++++++++++--------- rans.shader | 14 ++++++-------- tally.shader | 4 ++++ 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp index ca28528..b53cfc0 100644 --- a/narabu-encoder.cpp +++ b/narabu-encoder.cpp @@ -41,13 +41,19 @@ unsigned char pix_y[WIDTH * HEIGHT]; unsigned char pix_cb[(WIDTH/2) * HEIGHT]; unsigned char pix_cr[(WIDTH/2) * HEIGHT]; -struct RansDistSSBO { +struct RansCountSSBO { unsigned dist[4 * 256]; unsigned ransfreq[4 * 256]; +}; + +struct RansDistUBO { struct { uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq; } ransdist[4 * 256]; - unsigned sign_biases[4]; + struct { + uint32_t val; + uint32_t padding[3]; // std140 layout. + } sign_biases[4]; }; using namespace std; @@ -194,11 +200,24 @@ int main(int argc, char **argv) } check_error(); - // An SSBO for the rANS distributions. + // An SSBO for the raw rANS counts. GLuint ssbo; glGenBuffers(1, &ssbo); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo); - glNamedBufferStorage(ssbo, sizeof(RansDistSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + check_error(); + + // UBO for the rANS distributions (copied from an SSBO). + GLuint dist_ssbo; + glGenBuffers(1, &dist_ssbo); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo); + glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + check_error(); + + GLuint dist_ubo; + glGenBuffers(1, &dist_ubo); + glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo); + glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); // SSBOs for the rANS output (data and offsets). @@ -220,11 +239,12 @@ int main(int argc, char **argv) glUseProgram(glsl_tally_program_num); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo); glUseProgram(glsl_rans_program_num); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo); + glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo); glUseProgram(glsl_program_num); check_error(); @@ -304,7 +324,7 @@ int main(int argc, char **argv) steady_clock::time_point start = steady_clock::now(); unsigned num_iterations = 100; for (unsigned i = 0; i < num_iterations; ++i) { - glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansDistSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr); + glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr); glUseProgram(glsl_program_num); glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); @@ -313,6 +333,9 @@ int main(int argc, char **argv) glDispatchCompute(4, 1, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO)); + glMemoryBarrier(GL_UNIFORM_BARRIER_BIT); + glUseProgram(glsl_rans_program_num); glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5); } @@ -340,20 +363,21 @@ int main(int argc, char **argv) } // Write out the distributions. - const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansDistSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables. // TODO: rather gamma-k or something for (unsigned i = 0; i < 4; ++i) { printf("writing table %d\n", i); for (unsigned j = 0; j < NUM_SYMS; ++j) { printf("%d,%d: freq=%d x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n", - i, j, rans_dist->ransfreq[i * 256 + j], + i, j, rans_count->ransfreq[i * 256 + j], rans_dist->ransdist[i * 256 + j].x_max, rans_dist->ransdist[i * 256 + j].rcp_freq, rans_dist->ransdist[i * 256 + j].bias, rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff, rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16); - write_varint(rans_dist->ransfreq[i * 256 + j], codedfp); + write_varint(rans_count->ransfreq[i * 256 + j], codedfp); } } } diff --git a/rans.shader b/rans.shader index b677415..c027231 100644 --- a/rans.shader +++ b/rans.shader @@ -24,14 +24,6 @@ const uint luma_mapping[8] = { MAPPING(3, 3, 3, 3, 3, 3, 3, 3), }; -layout(std430, binding = 9) buffer layoutName -{ - uint dist[4 * 256]; - uint ransfreq[4 * 256]; - uvec4 ransdist[4 * 256]; - uint sign_biases[4]; -}; - layout(std430, binding = 10) buffer outputBuf { uint8_t rans_output[]; @@ -42,6 +34,12 @@ layout(std430, binding = 11) buffer outputBuf2 uint rans_start_offset[]; }; +layout(std140, binding = 13) uniform DistBlock +{ + uvec4 ransdist[4 * 256]; + uint sign_biases[4]; +}; + struct RansEncoder { uint stream_num; // const uint lut_base; // const diff --git a/tally.shader b/tally.shader index 351d3fb..e0fb943 100644 --- a/tally.shader +++ b/tally.shader @@ -9,6 +9,10 @@ layout(std430, binding = 9) buffer layoutName { uint dist[4 * 256]; uint ransfreq[4 * 256]; +}; + +layout(std140, binding = 12) buffer distBlock // Will become an UBO to rans.shader, thus layout std140. +{ uvec4 ransdist[4 * 256]; uint sign_biases[4]; }; -- 2.39.2