X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=narabu-encoder.cpp;h=fe270641634652debd37edba61470d585da5fbf8;hb=0f84eb2a5fa77dbc20f7a073f83bdb674ebb7b75;hp=ca285282110480de405eadd04acf495898d2be6c;hpb=f3eac72e679d3e0ec7ae1d4484736cd552c344dd;p=narabu diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp index ca28528..fe27064 100644 --- a/narabu-encoder.cpp +++ b/narabu-encoder.cpp @@ -41,13 +41,19 @@ unsigned char pix_y[WIDTH * HEIGHT]; unsigned char pix_cb[(WIDTH/2) * HEIGHT]; unsigned char pix_cr[(WIDTH/2) * HEIGHT]; -struct RansDistSSBO { +struct RansCountSSBO { unsigned dist[4 * 256]; unsigned ransfreq[4 * 256]; +}; + +struct RansDistUBO { struct { uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq; } ransdist[4 * 256]; - unsigned sign_biases[4]; + struct { + uint32_t val; + uint32_t padding[3]; // std140 layout. + } sign_biases[4]; }; using namespace std; @@ -194,11 +200,24 @@ int main(int argc, char **argv) } check_error(); - // An SSBO for the rANS distributions. + // An SSBO for the raw rANS counts. GLuint ssbo; glGenBuffers(1, &ssbo); glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo); - glNamedBufferStorage(ssbo, sizeof(RansDistSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + check_error(); + + // UBO for the rANS distributions (copied from an SSBO). + GLuint dist_ssbo; + glGenBuffers(1, &dist_ssbo); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo); + glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + check_error(); + + GLuint dist_ubo; + glGenBuffers(1, &dist_ubo); + glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo); + glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); // SSBOs for the rANS output (data and offsets). @@ -208,10 +227,10 @@ int main(int argc, char **argv) glNamedBufferStorage(output_ssbo, 45 * 64 * 1024, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); - GLuint output_offset_ssbo; - glGenBuffers(1, &output_offset_ssbo); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_offset_ssbo); - glNamedBufferStorage(output_offset_ssbo, 45 * 64 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + GLuint bytes_written_ssbo; + glGenBuffers(1, &bytes_written_ssbo); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, bytes_written_ssbo); + glNamedBufferStorage(bytes_written_ssbo, 45 * 64 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); // Bind SSBOs. @@ -220,11 +239,12 @@ int main(int argc, char **argv) glUseProgram(glsl_tally_program_num); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo); glUseProgram(glsl_rans_program_num); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, bytes_written_ssbo); + glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo); glUseProgram(glsl_program_num); check_error(); @@ -304,7 +324,7 @@ int main(int argc, char **argv) steady_clock::time_point start = steady_clock::now(); unsigned num_iterations = 100; for (unsigned i = 0; i < num_iterations; ++i) { - glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansDistSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr); + glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr); glUseProgram(glsl_program_num); glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); @@ -313,6 +333,9 @@ int main(int argc, char **argv) glDispatchCompute(4, 1, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO)); + glMemoryBarrier(GL_UNIFORM_BARRIER_BIT); + glUseProgram(glsl_rans_program_num); glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5); } @@ -340,28 +363,28 @@ int main(int argc, char **argv) } // Write out the distributions. - const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansDistSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables. // TODO: rather gamma-k or something for (unsigned i = 0; i < 4; ++i) { printf("writing table %d\n", i); for (unsigned j = 0; j < NUM_SYMS; ++j) { printf("%d,%d: freq=%d x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n", - i, j, rans_dist->ransfreq[i * 256 + j], + i, j, rans_count->ransfreq[i * 256 + j], rans_dist->ransdist[i * 256 + j].x_max, rans_dist->ransdist[i * 256 + j].rcp_freq, rans_dist->ransdist[i * 256 + j].bias, rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff, rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16); - write_varint(rans_dist->ransfreq[i * 256 + j], codedfp); + write_varint(rans_count->ransfreq[i * 256 + j], codedfp); } } } // Write out the actual data. - // TODO: Do the deduplication. - const uint32_t *offsets = (const uint32_t *)glMapNamedBufferRange(output_offset_ssbo, 0, 45 * 64 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + const uint32_t *bytes_written = (const uint32_t *)glMapNamedBufferRange(bytes_written_ssbo, 0, 45 * 64 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); #if 0 for (int i = 0; i < 45*64; ++i) { printf("%d,%d,%d: %u\n", i / 64, (i / 8) % 8, i % 8, 1024 * (i + 1) - offsets[i]); @@ -370,25 +393,23 @@ int main(int argc, char **argv) const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, 45 * 64 * 1024, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + string last_block; for (unsigned y = 0; y < 8; ++y) { for (unsigned x = 0; x < 8; ++x) { for (unsigned int stream_idx = 0; stream_idx < 45; ++stream_idx) { const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * 1024; - const uint8_t *ptr = data + offsets[stream_idx * 64 + y * 8 + x]; - uint32_t num_rans_bytes = out_end - ptr; -#if 0 + uint32_t num_rans_bytes = bytes_written[stream_idx * 64 + y * 8 + x]; + const uint8_t *ptr = out_end - num_rans_bytes; + assert(num_rans_bytes <= 1024); + if (num_rans_bytes == last_block.size() && memcmp(last_block.data(), ptr, last_block.size()) == 0) { write_varint(0, codedfp); - clear(); - return 1; } else { last_block = string((const char *)ptr, num_rans_bytes); + write_varint(num_rans_bytes, codedfp); + fwrite(ptr, 1, num_rans_bytes, codedfp); } -#endif - - write_varint(num_rans_bytes, codedfp); - fwrite(ptr, 1, num_rans_bytes, codedfp); } } }