X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=narabu-encoder.cpp;h=36804d616f70681f411af412961a23ac87a542b4;hb=8ec3c0a5d091591d2961a64f4a8cdf3600111fa8;hp=6b1df4837ef5774bcb0a1c78d9a812f7ef70afbe;hpb=5e1d27014149311318e97b8e04a6e05ec858e57c;p=narabu diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp index 6b1df48..36804d6 100644 --- a/narabu-encoder.cpp +++ b/narabu-encoder.cpp @@ -19,8 +19,6 @@ #include -#include "ryg_rans/rans_byte.h" -#include "ryg_rans/renormalize.h" #include "util.h" #define WIDTH 1280 @@ -43,9 +41,19 @@ unsigned char pix_y[WIDTH * HEIGHT]; unsigned char pix_cb[(WIDTH/2) * HEIGHT]; unsigned char pix_cr[(WIDTH/2) * HEIGHT]; -struct RansDistSSBO { +struct RansCountSSBO { unsigned dist[4 * 256]; - std::pair ransdist[4 * 256]; + unsigned ransfreq[4 * 256]; +}; + +struct RansDistUBO { + struct { + uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq; + } ransdist[4 * 256]; + struct { + uint32_t val; + uint32_t padding[3]; // std140 layout. + } sign_biases[4]; }; using namespace std; @@ -192,11 +200,24 @@ int main(int argc, char **argv) } check_error(); - // An SSBO for the rANS distributions. + // An SSBO for the raw rANS counts. GLuint ssbo; glGenBuffers(1, &ssbo); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo); - glNamedBufferStorage(ssbo, 256 * 16 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + check_error(); + + // UBO for the rANS distributions (copied from an SSBO). + GLuint dist_ssbo; + glGenBuffers(1, &dist_ssbo); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo); + glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + check_error(); + + GLuint dist_ubo; + glGenBuffers(1, &dist_ubo); + glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo); + glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); // SSBOs for the rANS output (data and offsets). @@ -218,11 +239,12 @@ int main(int argc, char **argv) glUseProgram(glsl_tally_program_num); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo); glUseProgram(glsl_rans_program_num); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo); + glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo); glUseProgram(glsl_program_num); check_error(); @@ -263,11 +285,11 @@ int main(int argc, char **argv) check_error(); } - glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI); - glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI); - glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI); - glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I); - glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I); + glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI); + glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI); + glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI); + glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I); + glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I); glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI); check_error(); @@ -302,7 +324,7 @@ int main(int argc, char **argv) steady_clock::time_point start = steady_clock::now(); unsigned num_iterations = 100; for (unsigned i = 0; i < num_iterations; ++i) { - glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 16 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr); + glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr); glUseProgram(glsl_program_num); glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); @@ -311,6 +333,9 @@ int main(int argc, char **argv) glDispatchCompute(4, 1, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO)); + glMemoryBarrier(GL_UNIFORM_BARRIER_BIT); + glUseProgram(glsl_rans_program_num); glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5); } @@ -338,14 +363,21 @@ int main(int argc, char **argv) } // Write out the distributions. - const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, 256 * 16 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables. // TODO: rather gamma-k or something for (unsigned i = 0; i < 4; ++i) { printf("writing table %d\n", i); for (unsigned j = 0; j < NUM_SYMS; ++j) { - printf("%d,%d: %d\n", i, j, rans_dist->ransdist[i * 256 + j].first); - write_varint(rans_dist->ransdist[i * 256 + j].first, codedfp); + printf("%d,%d: freq=%d x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n", + i, j, rans_count->ransfreq[i * 256 + j], + rans_dist->ransdist[i * 256 + j].x_max, + rans_dist->ransdist[i * 256 + j].rcp_freq, + rans_dist->ransdist[i * 256 + j].bias, + rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff, + rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16); + write_varint(rans_count->ransfreq[i * 256 + j], codedfp); } } } @@ -362,25 +394,23 @@ int main(int argc, char **argv) const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, 45 * 64 * 1024, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + string last_block; for (unsigned y = 0; y < 8; ++y) { for (unsigned x = 0; x < 8; ++x) { for (unsigned int stream_idx = 0; stream_idx < 45; ++stream_idx) { const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * 1024; const uint8_t *ptr = data + offsets[stream_idx * 64 + y * 8 + x]; uint32_t num_rans_bytes = out_end - ptr; -#if 0 + assert(num_rans_bytes <= 1024); + if (num_rans_bytes == last_block.size() && memcmp(last_block.data(), ptr, last_block.size()) == 0) { write_varint(0, codedfp); - clear(); - return 1; } else { last_block = string((const char *)ptr, num_rans_bytes); + write_varint(num_rans_bytes, codedfp); + fwrite(ptr, 1, num_rans_bytes, codedfp); } -#endif - - write_varint(num_rans_bytes, codedfp); - fwrite(ptr, 1, num_rans_bytes, codedfp); } } }