git.sesse.net Git - narabu/blob - narabu-encoder.cpp

   1 #include <stdio.h>
   2 #include <stdint.h>
   3 #include <stdlib.h>
   4 #include <string.h>
   5 #include <assert.h>
   6 #include <math.h>
   7 #include <SDL2/SDL.h>
   8 #include <SDL2/SDL_error.h>
   9 #include <SDL2/SDL_video.h>
  10 #include <epoxy/gl.h>
  11
  12 #include <algorithm>
  13 #include <chrono>
  14 #include <memory>
  15 #include <numeric>
  16 #include <random>
  17 #include <vector>
  18 #include <unordered_map>
  19
  20 #include <movit/util.h>
  21
  22 #include "ryg_rans/rans_byte.h"
  23 #include "ryg_rans/renormalize.h"
  24 #include "util.h"
  25
  26 #define WIDTH 1280
  27 #define HEIGHT 720
  28 #define WIDTH_BLOCKS (WIDTH/8)
  29 #define WIDTH_BLOCKS_CHROMA (WIDTH/16)
  30 #define HEIGHT_BLOCKS (HEIGHT/8)
  31 #define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)
  32 #define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)
  33
  34 #define NUM_SYMS 256
  35 #define ESCAPE_LIMIT (NUM_SYMS - 1)
  36 #define BLOCKS_PER_STREAM 320
  37
  38 static constexpr uint32_t prob_bits = 12;
  39 static constexpr uint32_t prob_scale = 1 << prob_bits;
  40
  41 unsigned char rgb[WIDTH * HEIGHT * 3];
  42 unsigned char pix_y[WIDTH * HEIGHT];
  43 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
  44 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
  45
  46 using namespace std;
  47 using namespace std::chrono;
  48
  49 void write_varint(int x, FILE *fp)
  50 {
  51         while (x >= 128) {
  52                 putc((x & 0x7f) | 0x80, fp);
  53                 x >>= 7;
  54         }
  55         putc(x, fp);
  56 }
  57
  58 void readpix(unsigned char *ptr, const char *filename)
  59 {
  60         FILE *fp = fopen(filename, "rb");
  61         if (fp == nullptr) {
  62                 perror(filename);
  63                 exit(1);
  64         }
  65
  66         fseek(fp, 0, SEEK_END);
  67         long len = ftell(fp);
  68         assert(len >= WIDTH * HEIGHT * 3);
  69         fseek(fp, len - WIDTH * HEIGHT * 3, SEEK_SET);
  70
  71         fread(ptr, 1, WIDTH * HEIGHT * 3, fp);
  72         fclose(fp);
  73 }
  74
  75 struct SymbolStats
  76 {
  77     uint32_t freqs[NUM_SYMS];
  78     uint32_t cum_freqs[NUM_SYMS + 1];
  79
  80     void clear();
  81     void calc_cum_freqs();
  82     void normalize_freqs(uint32_t target_total);
  83 };
  84
  85 void SymbolStats::clear()
  86 {
  87     for (int i=0; i < NUM_SYMS; i++)
  88         freqs[i] = 0;
  89 }
  90
  91 void SymbolStats::calc_cum_freqs()
  92 {
  93     cum_freqs[0] = 0;
  94     for (int i=0; i < NUM_SYMS; i++)
  95         cum_freqs[i+1] = cum_freqs[i] + freqs[i];
  96 }
  97
  98 void SymbolStats::normalize_freqs(uint32_t target_total)
  99 {
 100     uint64_t real_freq[NUM_SYMS + 1];  // hack
 101
 102     assert(target_total >= NUM_SYMS);
 103
 104     calc_cum_freqs();
 105     uint32_t cur_total = cum_freqs[NUM_SYMS];
 106
 107     if (cur_total == 0) return;
 108
 109     double ideal_cost = 0.0;
 110     for (int i = 1; i <= NUM_SYMS; i++)
 111     {
 112       real_freq[i] = cum_freqs[i] - cum_freqs[i - 1];
 113       if (real_freq[i] > 0)
 114         ideal_cost -= real_freq[i] * log2(real_freq[i] / double(cur_total));
 115     }
 116
 117     OptimalRenormalize(cum_freqs, NUM_SYMS, prob_scale);
 118
 119     // calculate updated freqs and make sure we didn't screw anything up
 120     assert(cum_freqs[0] == 0 && cum_freqs[NUM_SYMS] == target_total);
 121     for (int i=0; i < NUM_SYMS; i++) {
 122         if (freqs[i] == 0)
 123             assert(cum_freqs[i+1] == cum_freqs[i]);
 124         else
 125             assert(cum_freqs[i+1] > cum_freqs[i]);
 126
 127         // calc updated freq
 128         freqs[i] = cum_freqs[i+1] - cum_freqs[i];
 129     }
 130
 131     double calc_cost = 0.0;
 132     for (int i = 1; i <= NUM_SYMS; i++)
 133     {
 134       uint64_t freq = cum_freqs[i] - cum_freqs[i - 1];
 135       if (real_freq[i] > 0)
 136         calc_cost -= real_freq[i] * log2(freq / double(target_total));
 137     }
 138
 139     static double total_loss = 0.0;
 140     total_loss += calc_cost - ideal_cost;
 141     static double total_loss_with_dp = 0.0;
 142         double optimal_cost = 0.0;
 143     //total_loss_with_dp += optimal_cost - ideal_cost;
 144     printf("ideal cost = %.0f bits, DP cost = %.0f bits, calc cost = %.0f bits (loss = %.2f bytes, total loss = %.2f bytes, total loss with DP = %.2f bytes)\n",
 145                 ideal_cost, optimal_cost,
 146                  calc_cost, (calc_cost - ideal_cost) / 8.0, total_loss / 8.0, total_loss_with_dp / 8.0);
 147 }
 148
 149 SymbolStats stats[128];
 150
 151 const int luma_mapping[64] = {
 152         0, 0, 1, 1, 2, 2, 3, 3,
 153         0, 0, 1, 2, 2, 2, 3, 3,
 154         1, 1, 2, 2, 2, 3, 3, 3,
 155         1, 1, 2, 2, 2, 3, 3, 3,
 156         1, 2, 2, 2, 2, 3, 3, 3,
 157         2, 2, 2, 2, 3, 3, 3, 3,
 158         2, 2, 3, 3, 3, 3, 3, 3,
 159         3, 3, 3, 3, 3, 3, 3, 3,
 160 };
 161
 162 int pick_stats_for(int x, int y)
 163 {
 164         return luma_mapping[y * 8 + x];
 165 }
 166
 167 class RansEncoder {
 168 public:
 169         RansEncoder()
 170         {
 171                 out_buf.reset(new uint8_t[out_max_size]);
 172                 clear();
 173         }
 174
 175         void init_prob(SymbolStats &s)
 176         {
 177                 for (int i = 0; i < NUM_SYMS; i++) {
 178                         //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
 179                         RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
 180                 }
 181                 sign_bias = s.cum_freqs[NUM_SYMS];
 182         }
 183
 184         void clear()
 185         {
 186                 out_end = out_buf.get() + out_max_size;
 187                 ptr = out_end; // *end* of output buffer
 188                 RansEncInit(&rans);
 189         }
 190
 191         uint32_t save_block(FILE *codedfp)  // Returns number of bytes.
 192         {
 193                 RansEncFlush(&rans, &ptr);
 194                 //printf("post-flush = %08x\n", rans);
 195
 196                 uint32_t num_rans_bytes = out_end - ptr;
 197                 if (num_rans_bytes == last_block.size() &&
 198                     memcmp(last_block.data(), ptr, last_block.size()) == 0) {
 199                         write_varint(0, codedfp);
 200                         clear();
 201                         return 1;
 202                 } else {
 203                         last_block = string((const char *)ptr, num_rans_bytes);
 204                 }
 205
 206                 write_varint(num_rans_bytes, codedfp);
 207                 //fwrite(&num_rans_bytes, 1, 4, codedfp);
 208                 fwrite(ptr, 1, num_rans_bytes, codedfp);
 209
 210                 //printf("first rANS bytes: %02x %02x %02x %02x %02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]);
 211
 212
 213                 clear();
 214
 215                 //printf("Saving block: %d rANS bytes\n", num_rans_bytes);
 216                 return num_rans_bytes;
 217                 //return num_rans_bytes;
 218         }
 219
 220         void encode_coeff(short signed_k)
 221         {
 222                 //printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
 223                 unsigned short k = abs(signed_k);
 224                 if (k >= ESCAPE_LIMIT) {
 225                         // Put the coefficient as a 1/(2^12) symbol _before_
 226                         // the 255 coefficient, since the decoder will read the
 227                         // 255 coefficient first.
 228                         RansEncPut(&rans, &ptr, k, 1, prob_bits);
 229                         k = ESCAPE_LIMIT;
 230                 }
 231                 RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
 232                 if (signed_k < 0) {
 233                         rans += sign_bias;
 234                 }
 235         }
 236
 237 private:
 238         static constexpr size_t out_max_size = 32 << 20; // 32 MB.
 239         static constexpr size_t max_num_sign = 1048576;  // Way too big. And actually bytes.
 240
 241         unique_ptr<uint8_t[]> out_buf;
 242         uint8_t *out_end;
 243         uint8_t *ptr;
 244         RansState rans;
 245         RansEncSymbol esyms[NUM_SYMS];
 246         uint32_t sign_bias;
 247
 248         std::string last_block;
 249 };
 250
 251 // Should be done on the GPU, of course, but irrelevant for the demonstration.
 252 void convert_ycbcr()
 253 {
 254         double coeff[3] = { 0.2126, 0.7152, 0.0722 };  // sum = 1.0
 255         double cb_fac = 1.0 / (coeff[0] + coeff[1] + 1.0f - coeff[2]);  // 0.539
 256         double cr_fac = 1.0 / (1.0f - coeff[0] + coeff[1] + coeff[2]);  // 0.635
 257
 258         unique_ptr<float[]> temp_cb(new float[WIDTH * HEIGHT]);
 259         unique_ptr<float[]> temp_cr(new float[WIDTH * HEIGHT]);
 260         for (unsigned yb = 0; yb < HEIGHT; ++yb) {
 261                 for (unsigned xb = 0; xb < WIDTH; ++xb) {
 262                         int r = rgb[((yb * WIDTH) + xb) * 3 + 0];
 263                         int g = rgb[((yb * WIDTH) + xb) * 3 + 1];
 264                         int b = rgb[((yb * WIDTH) + xb) * 3 + 2];
 265                         double y = std::min(std::max(coeff[0] * r + coeff[1] * g + coeff[2] * b, 0.0), 255.0);
 266                         double cb = (b - y) * cb_fac + 128.0;
 267                         double cr = (r - y) * cr_fac + 128.0;
 268                         pix_y[(yb * WIDTH) + xb] = lrint(y);
 269                         temp_cb[(yb * WIDTH) + xb] = cb;
 270                         temp_cr[(yb * WIDTH) + xb] = cr;
 271                 }
 272         }
 273
 274         // Simple 4:2:2 subsampling with left convention.
 275         for (unsigned yb = 0; yb < HEIGHT; ++yb) {
 276                 for (unsigned xb = 0; xb < WIDTH / 2; ++xb) {
 277                         int c0 = yb * WIDTH + std::max(int(xb) * 2 - 1, 0);
 278                         int c1 = yb * WIDTH + xb * 2;
 279                         int c2 = yb * WIDTH + xb * 2 + 1;
 280
 281                         double cb = 0.25 * temp_cb[c0] + 0.5 * temp_cb[c1] + 0.25 * temp_cb[c2];
 282                         double cr = 0.25 * temp_cr[c0] + 0.5 * temp_cr[c1] + 0.25 * temp_cr[c2];
 283                         cb = std::min(std::max(cb, 0.0), 255.0);
 284                         cr = std::min(std::max(cr, 0.0), 255.0);
 285                         pix_cb[(yb * WIDTH/2) + xb] = lrint(cb);
 286                         pix_cr[(yb * WIDTH/2) + xb] = lrint(cr);
 287                 }
 288         }
 289 }
 290
 291 int main(int argc, char **argv)
 292 {
 293         // Set up an OpenGL context using SDL.
 294         if (SDL_Init(SDL_INIT_VIDEO) == -1) {
 295                 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
 296                 exit(1);
 297         }
 298         SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
 299         SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
 300         SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
 301         SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
 302         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
 303         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
 304
 305         SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
 306                 SDL_WINDOWPOS_UNDEFINED,
 307                 SDL_WINDOWPOS_UNDEFINED,
 308                 32, 32,
 309                 SDL_WINDOW_OPENGL);
 310         SDL_GLContext context = SDL_GL_CreateContext(window);
 311         assert(context != nullptr);
 312
 313         if (argc >= 2)
 314                 readpix(rgb, argv[1]);
 315         else
 316                 readpix(rgb, "color.pnm");
 317         convert_ycbcr();
 318
 319         // Compile the shader.
 320         string shader_src = ::read_file("encoder.shader");
 321         GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
 322         GLuint glsl_program_num = glCreateProgram();
 323         glAttachShader(glsl_program_num, shader_num);
 324         glLinkProgram(glsl_program_num);
 325
 326         GLint success;
 327         glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
 328         if (success == GL_FALSE) {
 329                 GLchar error_log[1024] = {0};
 330                 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
 331                 fprintf(stderr, "Error linking program: %s\n", error_log);
 332                 exit(1);
 333         }
 334
 335         glUseProgram(glsl_program_num);
 336
 337         // Upload luma.
 338         GLuint y_tex;
 339         glGenTextures(1, &y_tex);
 340         glBindTexture(GL_TEXTURE_2D, y_tex);
 341         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
 342         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 343         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
 344         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
 345         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, pix_y);
 346         check_error();
 347
 348         // Make destination textures.
 349         GLuint dc_ac7_tex, ac1_ac6_tex, ac2_ac5_tex;
 350         for (GLuint *tex : { &dc_ac7_tex, &ac1_ac6_tex, &ac2_ac5_tex }) {
 351                 glGenTextures(1, tex);
 352                 glBindTexture(GL_TEXTURE_2D, *tex);
 353                 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
 354                 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 355                 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
 356                 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
 357                 glTexImage2D(GL_TEXTURE_2D, 0, GL_R16UI, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, nullptr);
 358                 check_error();
 359         }
 360
 361         GLuint ac3_tex, ac4_tex;
 362         for (GLuint *tex : { &ac3_tex, &ac4_tex }) {
 363                 glGenTextures(1, tex);
 364                 glBindTexture(GL_TEXTURE_2D, *tex);
 365                 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
 366                 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
 367                 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
 368                 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
 369                 glTexImage2D(GL_TEXTURE_2D, 0, GL_R8I, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_BYTE, nullptr);
 370                 check_error();
 371         }
 372
 373         GLint dc_ac7_tex_uniform = glGetUniformLocation(glsl_program_num, "dc_ac7_tex");
 374         GLint ac1_ac6_tex_uniform = glGetUniformLocation(glsl_program_num, "ac1_ac6_tex");
 375         GLint ac2_ac5_tex_uniform = glGetUniformLocation(glsl_program_num, "ac2_ac5_tex");
 376         GLint ac3_tex_uniform = glGetUniformLocation(glsl_program_num, "ac3_tex");
 377         GLint ac4_tex_uniform = glGetUniformLocation(glsl_program_num, "ac4_tex");
 378         GLint image_tex_uniform = glGetUniformLocation(glsl_program_num, "image_tex");
 379
 380         glUniform1i(dc_ac7_tex_uniform, 0);
 381         glUniform1i(ac1_ac6_tex_uniform, 1);
 382         glUniform1i(ac2_ac5_tex_uniform, 2);
 383         glUniform1i(ac3_tex_uniform, 3);
 384         glUniform1i(ac4_tex_uniform, 4);
 385         glUniform1i(image_tex_uniform, 5);
 386         glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
 387         glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
 388         glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
 389         glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
 390         glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
 391         glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
 392         check_error();
 393
 394         steady_clock::time_point start = steady_clock::now();
 395         unsigned num_iterations = 1000;
 396         for (unsigned i = 0; i < num_iterations; ++i) {
 397                 glDispatchCompute(WIDTH_BLOCKS, HEIGHT_BLOCKS, 1);
 398         }
 399         check_error();
 400         glFinish();
 401         steady_clock::time_point now = steady_clock::now();
 402
 403         // CPU part starts here -- will be GPU later.
 404         // We only do luma for now.
 405
 406         int16_t *coeff_y = new int16_t[WIDTH * HEIGHT];
 407
 408         glBindTexture(GL_TEXTURE_2D, dc_ac7_tex);
 409         uint16_t *dc_ac7_data = new uint16_t[(WIDTH/8) * HEIGHT];
 410         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, dc_ac7_data);
 411         check_error();
 412
 413         glBindTexture(GL_TEXTURE_2D, ac1_ac6_tex);
 414         uint16_t *ac1_ac6_data = new uint16_t[(WIDTH/8) * HEIGHT];
 415         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac1_ac6_data);
 416         check_error();
 417
 418         glBindTexture(GL_TEXTURE_2D, ac2_ac5_tex);
 419         uint16_t *ac2_ac5_data = new uint16_t[(WIDTH/8) * HEIGHT];
 420         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac2_ac5_data);
 421         check_error();
 422
 423         glBindTexture(GL_TEXTURE_2D, ac3_tex);
 424         int8_t *ac3_data = new int8_t[(WIDTH/8) * HEIGHT];
 425         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac3_data);
 426         check_error();
 427
 428         glBindTexture(GL_TEXTURE_2D, ac4_tex);
 429         int8_t *ac4_data = new int8_t[(WIDTH/8) * HEIGHT];
 430         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac4_data);
 431         check_error();
 432
 433         for (unsigned y = 0; y < HEIGHT; ++y) {
 434                 for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
 435                         coeff_y[y * WIDTH + xb*8 + 0] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 23) >> 23;
 436                         coeff_y[y * WIDTH + xb*8 + 7] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 16) >> 25;
 437                         coeff_y[y * WIDTH + xb*8 + 1] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 23) >> 23;
 438                         coeff_y[y * WIDTH + xb*8 + 6] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 16) >> 25;
 439                         coeff_y[y * WIDTH + xb*8 + 2] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 23) >> 23;
 440                         coeff_y[y * WIDTH + xb*8 + 5] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 16) >> 25;
 441                         coeff_y[y * WIDTH + xb*8 + 3] = ac3_data[y * (WIDTH/8) + xb];
 442                         coeff_y[y * WIDTH + xb*8 + 4] = ac4_data[y * (WIDTH/8) + xb];
 443                 }
 444         }
 445
 446 #if 1
 447         for (unsigned y = 0; y < HEIGHT; ++y) {
 448                 for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
 449                         printf("%4d %4d %4d %4d %4d %4d %4d %4d | ",
 450                                 coeff_y[y * WIDTH + xb*8 + 0],
 451                                 coeff_y[y * WIDTH + xb*8 + 1],
 452                                 coeff_y[y * WIDTH + xb*8 + 2],
 453                                 coeff_y[y * WIDTH + xb*8 + 3],
 454                                 coeff_y[y * WIDTH + xb*8 + 4],
 455                                 coeff_y[y * WIDTH + xb*8 + 5],
 456                                 coeff_y[y * WIDTH + xb*8 + 6],
 457                                 coeff_y[y * WIDTH + xb*8 + 7]);
 458                 }
 459                 printf("\n");
 460         }
 461 #endif
 462
 463         // DC coefficient pred from the right to left (within each slice)
 464         for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
 465                 int prev_k = 128;
 466
 467                 for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
 468                         unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
 469                         unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
 470                         int k = coeff_y[(yb * 8) * WIDTH + xb * 8];
 471
 472                         coeff_y[(yb * 8) * WIDTH + xb * 8] = k - prev_k;
 473
 474                         prev_k = k;
 475                 }
 476         }
 477
 478         // For each coefficient, make some tables.
 479         size_t extra_bits = 0;
 480         for (unsigned i = 0; i < 64; ++i) {
 481                 stats[i].clear();
 482         }
 483         for (unsigned y = 0; y < 8; ++y) {
 484                 for (unsigned x = 0; x < 8; ++x) {
 485                         SymbolStats &s_luma = stats[pick_stats_for(x, y)];
 486
 487                         // Luma
 488                         for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
 489                                 for (unsigned xb = 0; xb < WIDTH; xb += 8) {
 490                                         unsigned short k = abs(coeff_y[(yb + y) * WIDTH + (xb + x)]);
 491                                         if (k >= ESCAPE_LIMIT) {
 492                                                 k = ESCAPE_LIMIT;
 493                                                 extra_bits += 12;  // escape this one
 494                                         }
 495                                         ++s_luma.freqs[(k - 1) & (NUM_SYMS - 1)];
 496                                 }
 497                         }
 498                 }
 499         }
 500
 501         for (unsigned i = 0; i < 64; ++i) {
 502                 stats[i].freqs[NUM_SYMS - 1] /= 2;  // zero, has no sign bits (yes, this is trickery)
 503                 stats[i].normalize_freqs(prob_scale);
 504                 stats[i].cum_freqs[NUM_SYMS] += stats[i].freqs[NUM_SYMS - 1];
 505                 stats[i].freqs[NUM_SYMS - 1] *= 2;
 506         }
 507
 508         FILE *codedfp = fopen("coded.dat", "wb");
 509         if (codedfp == nullptr) {
 510                 perror("coded.dat");
 511                 exit(1);
 512         }
 513
 514         for (unsigned r = 0; r < 2; ++r) {  // Hack to write fake chroma tables.
 515                 // TODO: rather gamma-k or something
 516                 for (unsigned i = 0; i < 64; ++i) {
 517                         if (stats[i].cum_freqs[NUM_SYMS] == 0) {
 518                                 continue;
 519                         }
 520                         printf("writing table %d\n", i);
 521                         for (unsigned j = 0; j < NUM_SYMS; ++j) {
 522                                 write_varint(stats[i].freqs[j], codedfp);
 523                         }
 524                 }
 525         }
 526
 527         RansEncoder rans_encoder;
 528
 529         size_t tot_bytes = 0;
 530
 531         // Luma
 532         for (unsigned y = 0; y < 8; ++y) {
 533                 for (unsigned x = 0; x < 8; ++x) {
 534                         SymbolStats &s_luma = stats[pick_stats_for(x, y)];
 535                         rans_encoder.init_prob(s_luma);
 536
 537                         // Luma
 538                         std::vector<int> lens;
 539
 540                         rans_encoder.clear();
 541                         size_t num_bytes = 0;
 542                         for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
 543                                 unsigned yb = block_idx / WIDTH_BLOCKS;
 544                                 unsigned xb = block_idx % WIDTH_BLOCKS;
 545
 546                                 int k = coeff_y[(yb * 8 + y) * WIDTH + (xb * 8 + x)];
 547                                 rans_encoder.encode_coeff(k);
 548
 549                                 if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
 550                                         int l = rans_encoder.save_block(codedfp);
 551                                         num_bytes += l;
 552                                         lens.push_back(l);
 553                                 }
 554                         }
 555                         tot_bytes += num_bytes;
 556                         printf("coeff %d Y': %ld bytes\n", y * 8 + x, num_bytes);
 557                 }
 558         }
 559
 560         printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
 561                 tot_bytes - extra_bits / 8,
 562                 extra_bits,
 563                 extra_bits / 8,
 564                 tot_bytes);
 565
 566         printf("\n");
 567         printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);
 568
 569 }