git.sesse.net Git - narabu/blobdiff - narabu-encoder.cpp
More fixes of hard-coded values.
[narabu] / narabu-encoder.cpp
index d13f0446516a73d3a68b333c35074d00ed032b19..f2dd7c29ff25e54e8e00c0b09d2fbfe41c8896f7 100644 (file)
@@ -32,6 +32,9 @@
 #define NUM_SYMS 256
 #define ESCAPE_LIMIT (NUM_SYMS - 1)
 #define BLOCKS_PER_STREAM 320
+#define STREAM_BUF_SIZE 1024  // In bytes.
+
+#define NUM_STREAMS ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM)
 
 static constexpr uint32_t prob_bits = 12;
 static constexpr uint32_t prob_scale = 1 << prob_bits;
@@ -41,9 +44,19 @@ unsigned char pix_y[WIDTH * HEIGHT];
 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
 
-struct RansDistSSBO {
+struct RansCountSSBO {
        unsigned dist[4 * 256];
-       std::pair<unsigned, unsigned> ransdist[4 * 256];
+       unsigned ransfreq[4 * 256];
+};
+
+struct RansDistUBO {
+       struct {
+               uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq;
+       } ransdist[4 * 256];
+       struct {
+               uint32_t val;
+               uint32_t padding[3];  // std140 layout.
+       } sign_biases[4];
 };
 
 using namespace std;
@@ -190,24 +203,37 @@ int main(int argc, char **argv)
        }
        check_error();
 
-       // An SSBO for the rANS distributions.
+       // An SSBO for the raw rANS counts.
        GLuint ssbo;
        glGenBuffers(1, &ssbo);
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
-       glNamedBufferStorage(ssbo, 256 * 16 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       // UBO for the rANS distributions (copied from an SSBO).
+       GLuint dist_ssbo;
+       glGenBuffers(1, &dist_ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo);
+       glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       GLuint dist_ubo;
+       glGenBuffers(1, &dist_ubo);
+       glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo);
+       glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        check_error();
 
        // SSBOs for the rANS output (data and offsets).
        GLuint output_ssbo;
        glGenBuffers(1, &output_ssbo);
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_ssbo);
-       glNamedBufferStorage(output_ssbo, 45 * 64 * 1024, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       glNamedBufferStorage(output_ssbo, 64 * NUM_STREAMS * STREAM_BUF_SIZE, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        check_error();
 
-       GLuint output_offset_ssbo;
-       glGenBuffers(1, &output_offset_ssbo);
-       glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_offset_ssbo);
-       glNamedBufferStorage(output_offset_ssbo, 45 * 64 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       GLuint bytes_written_ssbo;
+       glGenBuffers(1, &bytes_written_ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, bytes_written_ssbo);
+       glNamedBufferStorage(bytes_written_ssbo, 64 * NUM_STREAMS * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        check_error();
 
        // Bind SSBOs.
@@ -216,11 +242,12 @@ int main(int argc, char **argv)
 
        glUseProgram(glsl_tally_program_num);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo);
 
        glUseProgram(glsl_rans_program_num);
-       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo);
-       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, bytes_written_ssbo);
+       glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo);
 
        glUseProgram(glsl_program_num);
        check_error();
@@ -300,7 +327,7 @@ int main(int argc, char **argv)
        steady_clock::time_point start = steady_clock::now();
        unsigned num_iterations = 100;
        for (unsigned i = 0; i < num_iterations; ++i) {
-               glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 16 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr);
+               glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr);
                glUseProgram(glsl_program_num);
                glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
                glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
@@ -309,8 +336,11 @@ int main(int argc, char **argv)
                glDispatchCompute(4, 1, 1);
                glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
        
+               glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO));
+               glMemoryBarrier(GL_UNIFORM_BARRIER_BIT);
+
                glUseProgram(glsl_rans_program_num);
-               glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5);
+               glDispatchCompute(NUM_STREAMS, 8, 5);
        }
        check_error();
        glFinish();
@@ -336,49 +366,53 @@ int main(int argc, char **argv)
        }
 
        // Write out the distributions.
-       const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, 256 * 16 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        for (unsigned r = 0; r < 2; ++r) {  // Hack to write fake chroma tables.
                // TODO: rather gamma-k or something
                for (unsigned i = 0; i < 4; ++i) {
                        printf("writing table %d\n", i);
                        for (unsigned j = 0; j < NUM_SYMS; ++j) {
-                               printf("%d,%d: start=%d freq=%d\n", i, j, rans_dist->ransdist[i * 256 + j].first, rans_dist->ransdist[i * 256 + j].second);
-                               write_varint(rans_dist->ransdist[i * 256 + j].second, codedfp);
+                               printf("%d,%d: freq=%d  x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n",
+                                       i, j, rans_count->ransfreq[i * 256 + j],
+                                       rans_dist->ransdist[i * 256 + j].x_max,
+                                       rans_dist->ransdist[i * 256 + j].rcp_freq,
+                                       rans_dist->ransdist[i * 256 + j].bias,
+                                       rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff,
+                                       rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16);
+                               write_varint(rans_count->ransfreq[i * 256 + j], codedfp);
                        }
                }
        }
 
        // Write out the actual data.
-       // TODO: Do the deduplication.
 
-       const uint32_t *offsets = (const uint32_t *)glMapNamedBufferRange(output_offset_ssbo, 0, 45 * 64 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       const uint32_t *bytes_written = (const uint32_t *)glMapNamedBufferRange(bytes_written_ssbo, 0, 64 * NUM_STREAMS * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
 #if 0
-       for (int i = 0; i < 45*64; ++i) {
+       for (int i = 0; i < HEIGHT_BLOCKS*64; ++i) {
                printf("%d,%d,%d: %u\n", i / 64, (i / 8) % 8, i % 8, 1024 * (i + 1) - offsets[i]);
        }
 #endif
 
-       const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, 45 * 64 * 1024, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, 64 * NUM_STREAMS * STREAM_BUF_SIZE, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
 
+       string last_block;
        for (unsigned y = 0; y < 8; ++y) {
                for (unsigned x = 0; x < 8; ++x) {
-                       for (unsigned int stream_idx = 0; stream_idx < 45; ++stream_idx) {
-                               const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * 1024;
-                               const uint8_t *ptr = data + offsets[stream_idx * 64 + y * 8 + x];
-                               uint32_t num_rans_bytes = out_end - ptr;
-#if 0
+                       for (unsigned int stream_idx = 0; stream_idx < NUM_STREAMS; ++stream_idx) {
+                               const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * STREAM_BUF_SIZE;
+                               uint32_t num_rans_bytes = bytes_written[stream_idx * 64 + y * 8 + x];
+                               const uint8_t *ptr = out_end - num_rans_bytes;
+                               assert(num_rans_bytes <= STREAM_BUF_SIZE);
+
                                if (num_rans_bytes == last_block.size() &&
                                    memcmp(last_block.data(), ptr, last_block.size()) == 0) {
                                        write_varint(0, codedfp);
-                                       clear();
-                                       return 1;
                                } else {
                                        last_block = string((const char *)ptr, num_rans_bytes);
+                                       write_varint(num_rans_bytes, codedfp);
+                                       fwrite(ptr, 1, num_rans_bytes, codedfp);
                                }
-#endif
-
-                               write_varint(num_rans_bytes, codedfp);
-                               fwrite(ptr, 1, num_rans_bytes, codedfp);
                        }
                }
        }