Parametrize STREAM_BUF_SIZE in the .cpp file.

[narabu] / narabu-encoder.cpp
diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp

index 2ddd8992fd2fe0da3b56bd2b0ec90598ceb93461..b1d0d1f18e02d3ff805e469a596fb20a19b53c25 100644 (file)
--- a/narabu-encoder.cpp
+++ b/narabu-encoder.cpp
@@ -19,8 +19,6 @@
  
  #include <movit/util.h>
  
-#include "ryg_rans/rans_byte.h"
-#include "ryg_rans/renormalize.h"
  #include "util.h"
  
  #define WIDTH 1280
@@ -34,6 +32,7 @@
  #define NUM_SYMS 256
  #define ESCAPE_LIMIT (NUM_SYMS - 1)
  #define BLOCKS_PER_STREAM 320
+#define STREAM_BUF_SIZE 1024  // In bytes.
  
  static constexpr uint32_t prob_bits = 12;
  static constexpr uint32_t prob_scale = 1 << prob_bits;
@@ -43,6 +42,21 @@ unsigned char pix_y[WIDTH * HEIGHT];
  unsigned char pix_cb[(WIDTH/2) * HEIGHT];
  unsigned char pix_cr[(WIDTH/2) * HEIGHT];
  
+struct RansCountSSBO {  // CPU-side view of the count buffer `ssbo`, which main() allocates with sizeof(RansCountSSBO) and clears every iteration.
+       unsigned dist[4 * 256];  // Same 4 * 256 shape as ransfreq; presumably the raw per-symbol tallies -- TODO confirm against the shaders.
+       unsigned ransfreq[4 * 256];  // Indexed as table * 256 + symbol; main() writes these values to coded.dat as varints.
+};
+
+struct RansDistUBO {  // Layout used for both dist_ssbo and dist_ubo; main() sizes both with sizeof(RansDistUBO) and copies one into the other.
+       struct {
+               uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq;  // The last field packs two values: main() prints the low 16 bits as rcp_shift and the high 16 bits as cmpl_freq.
+       } ransdist[4 * 256];  // Indexed as table * 256 + symbol.
+       struct {
+               uint32_t val;
+               uint32_t padding[3];  // Pads each entry to 16 bytes (four uint32_t) for std140 layout.
+       } sign_biases[4];  // Presumably one entry per table -- TODO confirm against rans.shader.
+};
+
  using namespace std;
  using namespace std::chrono;
  
@@ -72,182 +86,6 @@ void readpix(unsigned char *ptr, const char *filename)
         fclose(fp);
  }
  
-struct SymbolStats
-{
-    uint32_t freqs[NUM_SYMS];
-    uint32_t cum_freqs[NUM_SYMS + 1];
-
-    void clear();
-    void calc_cum_freqs();
-    void normalize_freqs(uint32_t target_total);
-};
-
-void SymbolStats::clear()
-{
-    for (int i=0; i < NUM_SYMS; i++)
-        freqs[i] = 0;
-}
-
-void SymbolStats::calc_cum_freqs()
-{
-    cum_freqs[0] = 0;
-    for (int i=0; i < NUM_SYMS; i++)
-        cum_freqs[i+1] = cum_freqs[i] + freqs[i];
-}
-
-void SymbolStats::normalize_freqs(uint32_t target_total)
-{
-    uint64_t real_freq[NUM_SYMS + 1];  // hack
-
-    assert(target_total >= NUM_SYMS);
-
-    calc_cum_freqs();
-    uint32_t cur_total = cum_freqs[NUM_SYMS];
-
-    if (cur_total == 0) return;
-
-    double ideal_cost = 0.0;
-    for (int i = 1; i <= NUM_SYMS; i++)
-    {
-      real_freq[i] = cum_freqs[i] - cum_freqs[i - 1];
-      if (real_freq[i] > 0)
-        ideal_cost -= real_freq[i] * log2(real_freq[i] / double(cur_total));
-    }
-
-    OptimalRenormalize(cum_freqs, NUM_SYMS, prob_scale);
-
-    // calculate updated freqs and make sure we didn't screw anything up
-    assert(cum_freqs[0] == 0 && cum_freqs[NUM_SYMS] == target_total);
-    for (int i=0; i < NUM_SYMS; i++) {
-        if (freqs[i] == 0)
-            assert(cum_freqs[i+1] == cum_freqs[i]);
-        else
-            assert(cum_freqs[i+1] > cum_freqs[i]);
-
-        // calc updated freq
-        freqs[i] = cum_freqs[i+1] - cum_freqs[i];
-    }
-
-    double calc_cost = 0.0;
-    for (int i = 1; i <= NUM_SYMS; i++)
-    {
-      uint64_t freq = cum_freqs[i] - cum_freqs[i - 1];
-      if (real_freq[i] > 0)
-        calc_cost -= real_freq[i] * log2(freq / double(target_total));
-    }
-
-    static double total_loss = 0.0;
-    total_loss += calc_cost - ideal_cost;
-    static double total_loss_with_dp = 0.0;
-       double optimal_cost = 0.0;
-    //total_loss_with_dp += optimal_cost - ideal_cost;
-    printf("ideal cost = %.0f bits, DP cost = %.0f bits, calc cost = %.0f bits (loss = %.2f bytes, total loss = %.2f bytes, total loss with DP = %.2f bytes)\n",
-               ideal_cost, optimal_cost,
-                calc_cost, (calc_cost - ideal_cost) / 8.0, total_loss / 8.0, total_loss_with_dp / 8.0);
-}
-
-SymbolStats stats[128];
-
-const int luma_mapping[64] = {
-       0, 0, 1, 1, 2, 2, 3, 3,
-       0, 0, 1, 2, 2, 2, 3, 3,
-       1, 1, 2, 2, 2, 3, 3, 3,
-       1, 1, 2, 2, 2, 3, 3, 3,
-       1, 2, 2, 2, 2, 3, 3, 3,
-       2, 2, 2, 2, 3, 3, 3, 3,
-       2, 2, 3, 3, 3, 3, 3, 3,
-       3, 3, 3, 3, 3, 3, 3, 3,
-};
-
-int pick_stats_for(int x, int y)
-{
-       return luma_mapping[y * 8 + x];
-}
-
-class RansEncoder {
-public:
-       RansEncoder()
-       {
-               out_buf.reset(new uint8_t[out_max_size]);
-               clear();
-       }
-
-       void init_prob(SymbolStats &s)
-       {
-               for (int i = 0; i < NUM_SYMS; i++) {
-                       //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
-                       RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
-               }
-               sign_bias = s.cum_freqs[NUM_SYMS];
-       }
-
-       void clear()
-       {
-               out_end = out_buf.get() + out_max_size;
-               ptr = out_end; // *end* of output buffer
-               RansEncInit(&rans);
-       }
-
-       uint32_t save_block(FILE *codedfp)  // Returns number of bytes.
-       {
-               RansEncFlush(&rans, &ptr);
-               //printf("post-flush = %08x\n", rans);
-
-               uint32_t num_rans_bytes = out_end - ptr;
-               if (num_rans_bytes == last_block.size() &&
-                   memcmp(last_block.data(), ptr, last_block.size()) == 0) {
-                       write_varint(0, codedfp);
-                       clear();
-                       return 1;
-               } else {
-                       last_block = string((const char *)ptr, num_rans_bytes);
-               }
-
-               write_varint(num_rans_bytes, codedfp);
-               //fwrite(&num_rans_bytes, 1, 4, codedfp);
-               fwrite(ptr, 1, num_rans_bytes, codedfp);
-
-               //printf("first rANS bytes: %02x %02x %02x %02x %02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]);
-
-
-               clear();
-
-               //printf("Saving block: %d rANS bytes\n", num_rans_bytes);
-               return num_rans_bytes;
-               //return num_rans_bytes;
-       }
-
-       void encode_coeff(short signed_k)
-       {
-               //printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
-               unsigned short k = abs(signed_k);
-               if (k >= ESCAPE_LIMIT) {
-                       // Put the coefficient as a 1/(2^12) symbol _before_
-                       // the 255 coefficient, since the decoder will read the
-                       // 255 coefficient first.
-                       RansEncPut(&rans, &ptr, k, 1, prob_bits);
-                       k = ESCAPE_LIMIT;
-               }
-               RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
-               if (signed_k < 0) {
-                       rans += sign_bias;
-               }
-       }
-
-private:
-       static constexpr size_t out_max_size = 32 << 20; // 32 MB.
-       static constexpr size_t max_num_sign = 1048576;  // Way too big. And actually bytes.
-
-       unique_ptr<uint8_t[]> out_buf;
-       uint8_t *out_end;
-       uint8_t *ptr;
-       RansState rans;
-       RansEncSymbol esyms[NUM_SYMS];
-       uint32_t sign_bias;
-
-       std::string last_block;
-};
-
  // Should be done on the GPU, of course, but irrelevant for the demonstration.
  void convert_ycbcr()
  {
@@ -316,7 +154,7 @@ int main(int argc, char **argv)
                 readpix(rgb, "color.pnm");
         convert_ycbcr();
  
-       // Compile the shader.
+       // Compile the DCT shader.
         string shader_src = ::read_file("encoder.shader");
         GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
         GLuint glsl_program_num = glCreateProgram();
@@ -332,7 +170,85 @@ int main(int argc, char **argv)
                 exit(1);
         }
  
+       // Compile the tally shader.
+       shader_src = ::read_file("tally.shader");
+       shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
+       GLuint glsl_tally_program_num = glCreateProgram();
+       glAttachShader(glsl_tally_program_num, shader_num);
+       glLinkProgram(glsl_tally_program_num);
+
+       glGetProgramiv(glsl_tally_program_num, GL_LINK_STATUS, &success);
+       if (success == GL_FALSE) {
+               GLchar error_log[1024] = {0};
+               glGetProgramInfoLog(glsl_tally_program_num, 1024, nullptr, error_log);
+               fprintf(stderr, "Error linking program: %s\n", error_log);
+               exit(1);
+       }
+
+       // Compile the rANS shader.
+       shader_src = ::read_file("rans.shader");
+       shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
+       GLuint glsl_rans_program_num = glCreateProgram();
+       glAttachShader(glsl_rans_program_num, shader_num);
+       glLinkProgram(glsl_rans_program_num);
+
+       glGetProgramiv(glsl_rans_program_num, GL_LINK_STATUS, &success);
+       if (success == GL_FALSE) {
+               GLchar error_log[1024] = {0};
+               glGetProgramInfoLog(glsl_rans_program_num, 1024, nullptr, error_log);
+               fprintf(stderr, "Error linking program: %s\n", error_log);
+               exit(1);
+       }
+       check_error();
+
+       // An SSBO for the raw rANS counts.
+       GLuint ssbo;
+       glGenBuffers(1, &ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
+       glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       // UBO for the rANS distributions (copied from an SSBO).
+       GLuint dist_ssbo;
+       glGenBuffers(1, &dist_ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo);
+       glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       GLuint dist_ubo;
+       glGenBuffers(1, &dist_ubo);
+       glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo);
+       glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       // SSBOs for the rANS output (data and offsets).
+       GLuint output_ssbo;
+       glGenBuffers(1, &output_ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_ssbo);
+       glNamedBufferStorage(output_ssbo, HEIGHT_BLOCKS * WIDTH_BLOCKS * STREAM_BUF_SIZE, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       GLuint bytes_written_ssbo;
+       glGenBuffers(1, &bytes_written_ssbo);
+       glBindBuffer(GL_SHADER_STORAGE_BUFFER, bytes_written_ssbo);
+       glNamedBufferStorage(bytes_written_ssbo, HEIGHT_BLOCKS * WIDTH_BLOCKS * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       check_error();
+
+       // Bind SSBOs.
         glUseProgram(glsl_program_num);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+
+       glUseProgram(glsl_tally_program_num);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo);
+
+       glUseProgram(glsl_rans_program_num);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo);
+       glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, bytes_written_ssbo);
+       glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo);
+
+       glUseProgram(glsl_program_num);
+       check_error();
  
         // Upload luma.
         GLuint y_tex;
@@ -370,149 +286,76 @@ int main(int argc, char **argv)
                 check_error();
         }
  
+       glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
+       glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
+       glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
+       glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);
+       glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);
+       glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
+       check_error();
+
+       // Bind uniforms.
+       glUseProgram(glsl_program_num);
         GLint dc_ac7_tex_uniform = glGetUniformLocation(glsl_program_num, "dc_ac7_tex");
         GLint ac1_ac6_tex_uniform = glGetUniformLocation(glsl_program_num, "ac1_ac6_tex");
         GLint ac2_ac5_tex_uniform = glGetUniformLocation(glsl_program_num, "ac2_ac5_tex");
         GLint ac3_tex_uniform = glGetUniformLocation(glsl_program_num, "ac3_tex");
         GLint ac4_tex_uniform = glGetUniformLocation(glsl_program_num, "ac4_tex");
         GLint image_tex_uniform = glGetUniformLocation(glsl_program_num, "image_tex");
-
         glUniform1i(dc_ac7_tex_uniform, 0);
         glUniform1i(ac1_ac6_tex_uniform, 1);
         glUniform1i(ac2_ac5_tex_uniform, 2);
         glUniform1i(ac3_tex_uniform, 3);
         glUniform1i(ac4_tex_uniform, 4);
         glUniform1i(image_tex_uniform, 5);
-       glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
-       glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
-       glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
-       glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
-       glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
-       glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
-       check_error();
+
+       glUseProgram(glsl_rans_program_num);
+       dc_ac7_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "dc_ac7_tex");
+       ac1_ac6_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac1_ac6_tex");
+       ac2_ac5_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac2_ac5_tex");
+       ac3_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac3_tex");
+       ac4_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac4_tex");
+       image_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "image_tex");
+       glUniform1i(dc_ac7_tex_uniform, 0);
+       glUniform1i(ac1_ac6_tex_uniform, 1);
+       glUniform1i(ac2_ac5_tex_uniform, 2);
+       glUniform1i(ac3_tex_uniform, 3);
+       glUniform1i(ac4_tex_uniform, 4);
  
         steady_clock::time_point start = steady_clock::now();
-       unsigned num_iterations = 1000;
+       unsigned num_iterations = 100;
         for (unsigned i = 0; i < num_iterations; ++i) {
-               glDispatchCompute(WIDTH_BLOCKS, HEIGHT_BLOCKS, 1);
+               glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr);
+               glUseProgram(glsl_program_num);
+               glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
+               glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+               glUseProgram(glsl_tally_program_num);
+               glDispatchCompute(4, 1, 1);
+               glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+       
+               glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO));
+               glMemoryBarrier(GL_UNIFORM_BARRIER_BIT);
+
+               glUseProgram(glsl_rans_program_num);
+               glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5);
         }
         check_error();
         glFinish();
-       steady_clock::time_point now = steady_clock::now();
-
-       // CPU part starts here -- will be GPU later.
-       // We only do luma for now.
-
-       int16_t *coeff_y = new int16_t[WIDTH * HEIGHT];
-
-       glBindTexture(GL_TEXTURE_2D, dc_ac7_tex);
-       uint16_t *dc_ac7_data = new uint16_t[(WIDTH/8) * HEIGHT];
-       glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, dc_ac7_data);
-       check_error();
-
-       glBindTexture(GL_TEXTURE_2D, ac1_ac6_tex);
-       uint16_t *ac1_ac6_data = new uint16_t[(WIDTH/8) * HEIGHT];
-       glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac1_ac6_data);
-       check_error();
-
-       glBindTexture(GL_TEXTURE_2D, ac2_ac5_tex);
-       uint16_t *ac2_ac5_data = new uint16_t[(WIDTH/8) * HEIGHT];
-       glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac2_ac5_data);
-       check_error();
-
-       glBindTexture(GL_TEXTURE_2D, ac3_tex);
-       int8_t *ac3_data = new int8_t[(WIDTH/8) * HEIGHT];
-       glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac3_data);
-       check_error();
-
-       glBindTexture(GL_TEXTURE_2D, ac4_tex);
-       int8_t *ac4_data = new int8_t[(WIDTH/8) * HEIGHT];
-       glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac4_data);
         check_error();
+       steady_clock::time_point now = steady_clock::now();
  
-       for (unsigned y = 0; y < HEIGHT; ++y) {
-               for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
-                       coeff_y[y * WIDTH + xb*8 + 0] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 23) >> 23;
-                       coeff_y[y * WIDTH + xb*8 + 7] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 16) >> 25;
-                       coeff_y[y * WIDTH + xb*8 + 1] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 23) >> 23;
-                       coeff_y[y * WIDTH + xb*8 + 6] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 16) >> 25;
-                       coeff_y[y * WIDTH + xb*8 + 2] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 23) >> 23;
-                       coeff_y[y * WIDTH + xb*8 + 5] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 16) >> 25;
-                       coeff_y[y * WIDTH + xb*8 + 3] = ac3_data[y * (WIDTH/8) + xb];
-                       coeff_y[y * WIDTH + xb*8 + 4] = ac4_data[y * (WIDTH/8) + xb];
-               }
-       }
+#if 0
+       printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
+               tot_bytes - extra_bits / 8,
+               extra_bits,
+               extra_bits / 8,
+               tot_bytes);
  
-#if 1
-       for (unsigned y = 0; y < HEIGHT; ++y) {
-               for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
-                       printf("%4d %4d %4d %4d %4d %4d %4d %4d | ",
-                               coeff_y[y * WIDTH + xb*8 + 0],
-                               coeff_y[y * WIDTH + xb*8 + 1],
-                               coeff_y[y * WIDTH + xb*8 + 2],
-                               coeff_y[y * WIDTH + xb*8 + 3],
-                               coeff_y[y * WIDTH + xb*8 + 4],
-                               coeff_y[y * WIDTH + xb*8 + 5],
-                               coeff_y[y * WIDTH + xb*8 + 6],
-                               coeff_y[y * WIDTH + xb*8 + 7]);
-                       printf("%4d %4d %4d %4d %4d %4d %4d %4d || ",
-                               pix_y[y * WIDTH + xb*8 + 0],
-                               pix_y[y * WIDTH + xb*8 + 1],
-                               pix_y[y * WIDTH + xb*8 + 2],
-                               pix_y[y * WIDTH + xb*8 + 3],
-                               pix_y[y * WIDTH + xb*8 + 4],
-                               pix_y[y * WIDTH + xb*8 + 5],
-                               pix_y[y * WIDTH + xb*8 + 6],
-                               pix_y[y * WIDTH + xb*8 + 7]);
-               }
-               printf("\n");
-       }
+       printf("\n");
  #endif
  
-       // DC coefficient pred from the right to left (within each slice)
-       for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
-               int prev_k = 128;
-
-               for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
-                       unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
-                       unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
-                       int k = coeff_y[(yb * 8) * WIDTH + xb * 8];
-
-                       coeff_y[(yb * 8) * WIDTH + xb * 8] = k - prev_k;
-
-                       prev_k = k;
-               }
-       }
-
-       // For each coefficient, make some tables.
-       size_t extra_bits = 0;
-       for (unsigned i = 0; i < 64; ++i) {
-               stats[i].clear();
-       }
-       for (unsigned y = 0; y < 8; ++y) {
-               for (unsigned x = 0; x < 8; ++x) {
-                       SymbolStats &s_luma = stats[pick_stats_for(x, y)];
-
-                       // Luma
-                       for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
-                               for (unsigned xb = 0; xb < WIDTH; xb += 8) {
-                                       unsigned short k = abs(coeff_y[(yb + y) * WIDTH + (xb + x)]);
-                                       if (k >= ESCAPE_LIMIT) {
-                                               k = ESCAPE_LIMIT;
-                                               extra_bits += 12;  // escape this one
-                                       }
-                                       ++s_luma.freqs[(k - 1) & (NUM_SYMS - 1)];
-                               }
-                       }
-               }
-       }
-
-       for (unsigned i = 0; i < 64; ++i) {
-               stats[i].freqs[NUM_SYMS - 1] /= 2;  // zero, has no sign bits (yes, this is trickery)
-               stats[i].normalize_freqs(prob_scale);
-               stats[i].cum_freqs[NUM_SYMS] += stats[i].freqs[NUM_SYMS - 1];
-               stats[i].freqs[NUM_SYMS - 1] *= 2;
-       }
+       printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
  
         FILE *codedfp = fopen("coded.dat", "wb");
         if (codedfp == nullptr) {
@@ -520,59 +363,56 @@ int main(int argc, char **argv)
                 exit(1);
         }
  
+       // Write out the distributions.
+       const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
         for (unsigned r = 0; r < 2; ++r) {  // Hack to write fake chroma tables.
                 // TODO: rather gamma-k or something
-               for (unsigned i = 0; i < 64; ++i) {
-                       if (stats[i].cum_freqs[NUM_SYMS] == 0) {
-                               continue;
-                       }
+               for (unsigned i = 0; i < 4; ++i) {
                         printf("writing table %d\n", i);
                         for (unsigned j = 0; j < NUM_SYMS; ++j) {
-                               write_varint(stats[i].freqs[j], codedfp);
+                               printf("%d,%d: freq=%d  x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n",
+                                       i, j, rans_count->ransfreq[i * 256 + j],
+                                       rans_dist->ransdist[i * 256 + j].x_max,
+                                       rans_dist->ransdist[i * 256 + j].rcp_freq,
+                                       rans_dist->ransdist[i * 256 + j].bias,
+                                       rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff,
+                                       rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16);
+                               write_varint(rans_count->ransfreq[i * 256 + j], codedfp);
                         }
                 }
         }
  
-       RansEncoder rans_encoder;
+       // Write out the actual data.
  
-       size_t tot_bytes = 0;
+       const uint32_t *bytes_written = (const uint32_t *)glMapNamedBufferRange(bytes_written_ssbo, 0, HEIGHT_BLOCKS * WIDTH_BLOCKS * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+#if 0
+       for (int i = 0; i < HEIGHT_BLOCKS*64; ++i) {
+               printf("%d,%d,%d: %u\n", i / 64, (i / 8) % 8, i % 8, 1024 * (i + 1) - offsets[i]);
+       }
+#endif
  
-       // Luma
+       const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, HEIGHT_BLOCKS * WIDTH_BLOCKS * STREAM_BUF_SIZE, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+
+       string last_block;
         for (unsigned y = 0; y < 8; ++y) {
                 for (unsigned x = 0; x < 8; ++x) {
-                       SymbolStats &s_luma = stats[pick_stats_for(x, y)];
-                       rans_encoder.init_prob(s_luma);
-
-                       // Luma
-                       std::vector<int> lens;
-
-                       rans_encoder.clear();
-                       size_t num_bytes = 0;
-                       for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
-                               unsigned yb = block_idx / WIDTH_BLOCKS;
-                               unsigned xb = block_idx % WIDTH_BLOCKS;
-
-                               int k = coeff_y[(yb * 8 + y) * WIDTH + (xb * 8 + x)];
-                               rans_encoder.encode_coeff(k);
-
-                               if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
-                                       int l = rans_encoder.save_block(codedfp);
-                                       num_bytes += l;
-                                       lens.push_back(l);
+                       for (unsigned int stream_idx = 0; stream_idx < HEIGHT_BLOCKS; ++stream_idx) {
+                               const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * STREAM_BUF_SIZE;
+                               uint32_t num_rans_bytes = bytes_written[stream_idx * 64 + y * 8 + x];
+                               const uint8_t *ptr = out_end - num_rans_bytes;
+                               assert(num_rans_bytes <= STREAM_BUF_SIZE);
+
+                               if (num_rans_bytes == last_block.size() &&
+                                   memcmp(last_block.data(), ptr, last_block.size()) == 0) {
+                                       write_varint(0, codedfp);
+                               } else {
+                                       last_block = string((const char *)ptr, num_rans_bytes);
+                                       write_varint(num_rans_bytes, codedfp);
+                                       fwrite(ptr, 1, num_rans_bytes, codedfp);
                                 }
                         }
-                       tot_bytes += num_bytes;
-                       printf("coeff %d Y': %ld bytes\n", y * 8 + x, num_bytes);
                 }
         }
-
-       printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
-               tot_bytes - extra_bits / 8,
-               extra_bits,
-               extra_bits / 8,
-               tot_bytes);
-
-       printf("\n");
-       printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);
-
+       fclose(codedfp);
  }