X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=narabu-encoder.cpp;h=f2dd7c29ff25e54e8e00c0b09d2fbfe41c8896f7;hb=c6928acb2405c30fdaf1a5d9efeb2902d7c544aa;hp=6b1df4837ef5774bcb0a1c78d9a812f7ef70afbe;hpb=5e1d27014149311318e97b8e04a6e05ec858e57c;p=narabu diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp index 6b1df48..f2dd7c2 100644 --- a/narabu-encoder.cpp +++ b/narabu-encoder.cpp @@ -19,8 +19,6 @@ #include -#include "ryg_rans/rans_byte.h" -#include "ryg_rans/renormalize.h" #include "util.h" #define WIDTH 1280 @@ -34,6 +32,9 @@ #define NUM_SYMS 256 #define ESCAPE_LIMIT (NUM_SYMS - 1) #define BLOCKS_PER_STREAM 320 +#define STREAM_BUF_SIZE 1024 // In bytes. + +#define NUM_STREAMS ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM) static constexpr uint32_t prob_bits = 12; static constexpr uint32_t prob_scale = 1 << prob_bits; @@ -43,9 +44,19 @@ unsigned char pix_y[WIDTH * HEIGHT]; unsigned char pix_cb[(WIDTH/2) * HEIGHT]; unsigned char pix_cr[(WIDTH/2) * HEIGHT]; -struct RansDistSSBO { +struct RansCountSSBO { unsigned dist[4 * 256]; - std::pair ransdist[4 * 256]; + unsigned ransfreq[4 * 256]; +}; + +struct RansDistUBO { + struct { + uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq; + } ransdist[4 * 256]; + struct { + uint32_t val; + uint32_t padding[3]; // std140 layout. + } sign_biases[4]; }; using namespace std; @@ -192,24 +203,37 @@ int main(int argc, char **argv) } check_error(); - // An SSBO for the rANS distributions. + // An SSBO for the raw rANS counts. GLuint ssbo; glGenBuffers(1, &ssbo); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo); - glNamedBufferStorage(ssbo, 256 * 16 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + check_error(); + + // UBO for the rANS distributions (copied from an SSBO). + GLuint dist_ssbo; + glGenBuffers(1, &dist_ssbo); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo); + glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + check_error(); + + GLuint dist_ubo; + glGenBuffers(1, &dist_ubo); + glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo); + glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); // SSBOs for the rANS output (data and offsets). GLuint output_ssbo; glGenBuffers(1, &output_ssbo); glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_ssbo); - glNamedBufferStorage(output_ssbo, 45 * 64 * 1024, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + glNamedBufferStorage(output_ssbo, 64 * NUM_STREAMS * STREAM_BUF_SIZE, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); - GLuint output_offset_ssbo; - glGenBuffers(1, &output_offset_ssbo); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_offset_ssbo); - glNamedBufferStorage(output_offset_ssbo, 45 * 64 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + GLuint bytes_written_ssbo; + glGenBuffers(1, &bytes_written_ssbo); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, bytes_written_ssbo); + glNamedBufferStorage(bytes_written_ssbo, 64 * NUM_STREAMS * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); // Bind SSBOs. @@ -218,11 +242,12 @@ int main(int argc, char **argv) glUseProgram(glsl_tally_program_num); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo); glUseProgram(glsl_rans_program_num); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, bytes_written_ssbo); + glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo); glUseProgram(glsl_program_num); check_error(); @@ -263,11 +288,11 @@ int main(int argc, char **argv) check_error(); } - glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI); - glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI); - glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI); - glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I); - glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I); + glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI); + glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI); + glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI); + glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I); + glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I); glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI); check_error(); @@ -302,7 +327,7 @@ int main(int argc, char **argv) steady_clock::time_point start = steady_clock::now(); unsigned num_iterations = 100; for (unsigned i = 0; i < num_iterations; ++i) { - glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 16 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr); + glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr); glUseProgram(glsl_program_num); glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); @@ -311,8 +336,11 @@ int main(int argc, char **argv) glDispatchCompute(4, 1, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO)); + glMemoryBarrier(GL_UNIFORM_BARRIER_BIT); + glUseProgram(glsl_rans_program_num); - glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5); + glDispatchCompute(NUM_STREAMS, 8, 5); } check_error(); glFinish(); @@ -338,49 +366,53 @@ int main(int argc, char **argv) } // Write out the distributions. - const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, 256 * 16 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables. // TODO: rather gamma-k or something for (unsigned i = 0; i < 4; ++i) { printf("writing table %d\n", i); for (unsigned j = 0; j < NUM_SYMS; ++j) { - printf("%d,%d: %d\n", i, j, rans_dist->ransdist[i * 256 + j].first); - write_varint(rans_dist->ransdist[i * 256 + j].first, codedfp); + printf("%d,%d: freq=%d x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n", + i, j, rans_count->ransfreq[i * 256 + j], + rans_dist->ransdist[i * 256 + j].x_max, + rans_dist->ransdist[i * 256 + j].rcp_freq, + rans_dist->ransdist[i * 256 + j].bias, + rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff, + rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16); + write_varint(rans_count->ransfreq[i * 256 + j], codedfp); } } } // Write out the actual data. - // TODO: Do the deduplication. - const uint32_t *offsets = (const uint32_t *)glMapNamedBufferRange(output_offset_ssbo, 0, 45 * 64 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const uint32_t *bytes_written = (const uint32_t *)glMapNamedBufferRange(bytes_written_ssbo, 0, 64 * NUM_STREAMS * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); #if 0 - for (int i = 0; i < 45*64; ++i) { + for (int i = 0; i < HEIGHT_BLOCKS*64; ++i) { printf("%d,%d,%d: %u\n", i / 64, (i / 8) % 8, i % 8, 1024 * (i + 1) - offsets[i]); } #endif - const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, 45 * 64 * 1024, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, 64 * NUM_STREAMS * STREAM_BUF_SIZE, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + string last_block; for (unsigned y = 0; y < 8; ++y) { for (unsigned x = 0; x < 8; ++x) { - for (unsigned int stream_idx = 0; stream_idx < 45; ++stream_idx) { - const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * 1024; - const uint8_t *ptr = data + offsets[stream_idx * 64 + y * 8 + x]; - uint32_t num_rans_bytes = out_end - ptr; -#if 0 + for (unsigned int stream_idx = 0; stream_idx < NUM_STREAMS; ++stream_idx) { + const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * STREAM_BUF_SIZE; + uint32_t num_rans_bytes = bytes_written[stream_idx * 64 + y * 8 + x]; + const uint8_t *ptr = out_end - num_rans_bytes; + assert(num_rans_bytes <= STREAM_BUF_SIZE); + if (num_rans_bytes == last_block.size() && memcmp(last_block.data(), ptr, last_block.size()) == 0) { write_varint(0, codedfp); - clear(); - return 1; } else { last_block = string((const char *)ptr, num_rans_bytes); + write_varint(num_rans_bytes, codedfp); + fwrite(ptr, 1, num_rans_bytes, codedfp); } -#endif - - write_varint(num_rans_bytes, codedfp); - fwrite(ptr, 1, num_rans_bytes, codedfp); } } }