--- /dev/null
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <SDL2/SDL.h>
+#include <SDL2/SDL_error.h>
+#include <SDL2/SDL_video.h>
+#include <epoxy/gl.h>
+
+#include <algorithm>
+#include <chrono>
+#include <memory>
+#include <numeric>
+#include <random>
+#include <vector>
+#include <unordered_map>
+
+#include <movit/util.h>
+
+#include "ryg_rans/rans_byte.h"
+#include "ryg_rans/renormalize.h"
+#include "util.h"
+
+#define WIDTH 1280
+#define HEIGHT 720
+#define WIDTH_BLOCKS (WIDTH/8)
+#define WIDTH_BLOCKS_CHROMA (WIDTH/16)
+#define HEIGHT_BLOCKS (HEIGHT/8)
+#define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)
+#define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)
+
+#define NUM_SYMS 256
+#define ESCAPE_LIMIT (NUM_SYMS - 1)
+#define BLOCKS_PER_STREAM 320
+
+static constexpr uint32_t prob_bits = 12;
+static constexpr uint32_t prob_scale = 1 << prob_bits;
+
+unsigned char rgb[WIDTH * HEIGHT * 3];
+unsigned char pix_y[WIDTH * HEIGHT];
+unsigned char pix_cb[(WIDTH/2) * HEIGHT];
+unsigned char pix_cr[(WIDTH/2) * HEIGHT];
+
+using namespace std;
+using namespace std::chrono;
+
+void write_varint(int x, FILE *fp)
+{
+ while (x >= 128) {
+ putc((x & 0x7f) | 0x80, fp);
+ x >>= 7;
+ }
+ putc(x, fp);
+}
+
+void readpix(unsigned char *ptr, const char *filename)
+{
+ FILE *fp = fopen(filename, "rb");
+ if (fp == nullptr) {
+ perror(filename);
+ exit(1);
+ }
+
+ fseek(fp, 0, SEEK_END);
+ long len = ftell(fp);
+ assert(len >= WIDTH * HEIGHT * 3);
+ fseek(fp, len - WIDTH * HEIGHT * 3, SEEK_SET);
+
+ fread(ptr, 1, WIDTH * HEIGHT * 3, fp);
+ fclose(fp);
+}
+
+struct SymbolStats
+{
+ uint32_t freqs[NUM_SYMS];
+ uint32_t cum_freqs[NUM_SYMS + 1];
+
+ void clear();
+ void calc_cum_freqs();
+ void normalize_freqs(uint32_t target_total);
+};
+
+void SymbolStats::clear()
+{
+ for (int i=0; i < NUM_SYMS; i++)
+ freqs[i] = 0;
+}
+
+void SymbolStats::calc_cum_freqs()
+{
+ cum_freqs[0] = 0;
+ for (int i=0; i < NUM_SYMS; i++)
+ cum_freqs[i+1] = cum_freqs[i] + freqs[i];
+}
+
+void SymbolStats::normalize_freqs(uint32_t target_total)
+{
+ uint64_t real_freq[NUM_SYMS + 1]; // hack
+
+ assert(target_total >= NUM_SYMS);
+
+ calc_cum_freqs();
+ uint32_t cur_total = cum_freqs[NUM_SYMS];
+
+ if (cur_total == 0) return;
+
+ double ideal_cost = 0.0;
+ for (int i = 1; i <= NUM_SYMS; i++)
+ {
+ real_freq[i] = cum_freqs[i] - cum_freqs[i - 1];
+ if (real_freq[i] > 0)
+ ideal_cost -= real_freq[i] * log2(real_freq[i] / double(cur_total));
+ }
+
+ OptimalRenormalize(cum_freqs, NUM_SYMS, prob_scale);
+
+ // calculate updated freqs and make sure we didn't screw anything up
+ assert(cum_freqs[0] == 0 && cum_freqs[NUM_SYMS] == target_total);
+ for (int i=0; i < NUM_SYMS; i++) {
+ if (freqs[i] == 0)
+ assert(cum_freqs[i+1] == cum_freqs[i]);
+ else
+ assert(cum_freqs[i+1] > cum_freqs[i]);
+
+ // calc updated freq
+ freqs[i] = cum_freqs[i+1] - cum_freqs[i];
+ }
+
+ double calc_cost = 0.0;
+ for (int i = 1; i <= NUM_SYMS; i++)
+ {
+ uint64_t freq = cum_freqs[i] - cum_freqs[i - 1];
+ if (real_freq[i] > 0)
+ calc_cost -= real_freq[i] * log2(freq / double(target_total));
+ }
+
+ static double total_loss = 0.0;
+ total_loss += calc_cost - ideal_cost;
+ static double total_loss_with_dp = 0.0;
+ double optimal_cost = 0.0;
+ //total_loss_with_dp += optimal_cost - ideal_cost;
+ printf("ideal cost = %.0f bits, DP cost = %.0f bits, calc cost = %.0f bits (loss = %.2f bytes, total loss = %.2f bytes, total loss with DP = %.2f bytes)\n",
+ ideal_cost, optimal_cost,
+ calc_cost, (calc_cost - ideal_cost) / 8.0, total_loss / 8.0, total_loss_with_dp / 8.0);
+}
+
+SymbolStats stats[128];
+
+const int luma_mapping[64] = {
+ 0, 0, 1, 1, 2, 2, 3, 3,
+ 0, 0, 1, 2, 2, 2, 3, 3,
+ 1, 1, 2, 2, 2, 3, 3, 3,
+ 1, 1, 2, 2, 2, 3, 3, 3,
+ 1, 2, 2, 2, 2, 3, 3, 3,
+ 2, 2, 2, 2, 3, 3, 3, 3,
+ 2, 2, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+int pick_stats_for(int x, int y)
+{
+ return luma_mapping[y * 8 + x];
+}
+
+class RansEncoder {
+public:
+ RansEncoder()
+ {
+ out_buf.reset(new uint8_t[out_max_size]);
+ clear();
+ }
+
+ void init_prob(SymbolStats &s)
+ {
+ for (int i = 0; i < NUM_SYMS; i++) {
+ //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
+ RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
+ }
+ sign_bias = s.cum_freqs[NUM_SYMS];
+ }
+
+ void clear()
+ {
+ out_end = out_buf.get() + out_max_size;
+ ptr = out_end; // *end* of output buffer
+ RansEncInit(&rans);
+ }
+
+ uint32_t save_block(FILE *codedfp) // Returns number of bytes.
+ {
+ RansEncFlush(&rans, &ptr);
+ //printf("post-flush = %08x\n", rans);
+
+ uint32_t num_rans_bytes = out_end - ptr;
+ if (num_rans_bytes == last_block.size() &&
+ memcmp(last_block.data(), ptr, last_block.size()) == 0) {
+ write_varint(0, codedfp);
+ clear();
+ return 1;
+ } else {
+ last_block = string((const char *)ptr, num_rans_bytes);
+ }
+
+ write_varint(num_rans_bytes, codedfp);
+ //fwrite(&num_rans_bytes, 1, 4, codedfp);
+ fwrite(ptr, 1, num_rans_bytes, codedfp);
+
+ //printf("first rANS bytes: %02x %02x %02x %02x %02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]);
+
+
+ clear();
+
+ //printf("Saving block: %d rANS bytes\n", num_rans_bytes);
+ return num_rans_bytes;
+ //return num_rans_bytes;
+ }
+
+ void encode_coeff(short signed_k)
+ {
+ //printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
+ unsigned short k = abs(signed_k);
+ if (k >= ESCAPE_LIMIT) {
+ // Put the coefficient as a 1/(2^12) symbol _before_
+ // the 255 coefficient, since the decoder will read the
+ // 255 coefficient first.
+ RansEncPut(&rans, &ptr, k, 1, prob_bits);
+ k = ESCAPE_LIMIT;
+ }
+ RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
+ if (signed_k < 0) {
+ rans += sign_bias;
+ }
+ }
+
+private:
+ static constexpr size_t out_max_size = 32 << 20; // 32 MB.
+ static constexpr size_t max_num_sign = 1048576; // Way too big. And actually bytes.
+
+ unique_ptr<uint8_t[]> out_buf;
+ uint8_t *out_end;
+ uint8_t *ptr;
+ RansState rans;
+ RansEncSymbol esyms[NUM_SYMS];
+ uint32_t sign_bias;
+
+ std::string last_block;
+};
+
+// Should be done on the GPU, of course, but irrelevant for the demonstration.
+void convert_ycbcr()
+{
+ double coeff[3] = { 0.2126, 0.7152, 0.0722 }; // sum = 1.0
+ double cb_fac = 1.0 / (coeff[0] + coeff[1] + 1.0f - coeff[2]); // 0.539
+ double cr_fac = 1.0 / (1.0f - coeff[0] + coeff[1] + coeff[2]); // 0.635
+
+ unique_ptr<float[]> temp_cb(new float[WIDTH * HEIGHT]);
+ unique_ptr<float[]> temp_cr(new float[WIDTH * HEIGHT]);
+ for (unsigned yb = 0; yb < HEIGHT; ++yb) {
+ for (unsigned xb = 0; xb < WIDTH; ++xb) {
+ int r = rgb[((yb * WIDTH) + xb) * 3 + 0];
+ int g = rgb[((yb * WIDTH) + xb) * 3 + 1];
+ int b = rgb[((yb * WIDTH) + xb) * 3 + 2];
+ double y = std::min(std::max(coeff[0] * r + coeff[1] * g + coeff[2] * b, 0.0), 255.0);
+ double cb = (b - y) * cb_fac + 128.0;
+ double cr = (r - y) * cr_fac + 128.0;
+ pix_y[(yb * WIDTH) + xb] = lrint(y);
+ temp_cb[(yb * WIDTH) + xb] = cb;
+ temp_cr[(yb * WIDTH) + xb] = cr;
+ }
+ }
+
+ // Simple 4:2:2 subsampling with left convention.
+ for (unsigned yb = 0; yb < HEIGHT; ++yb) {
+ for (unsigned xb = 0; xb < WIDTH / 2; ++xb) {
+ int c0 = yb * WIDTH + std::max(int(xb) * 2 - 1, 0);
+ int c1 = yb * WIDTH + xb * 2;
+ int c2 = yb * WIDTH + xb * 2 + 1;
+
+ double cb = 0.25 * temp_cb[c0] + 0.5 * temp_cb[c1] + 0.25 * temp_cb[c2];
+ double cr = 0.25 * temp_cr[c0] + 0.5 * temp_cr[c1] + 0.25 * temp_cr[c2];
+ cb = std::min(std::max(cb, 0.0), 255.0);
+ cr = std::min(std::max(cr, 0.0), 255.0);
+ pix_cb[(yb * WIDTH/2) + xb] = lrint(cb);
+ pix_cr[(yb * WIDTH/2) + xb] = lrint(cr);
+ }
+ }
+}
+
+int main(int argc, char **argv)
+{
+ // Set up an OpenGL context using SDL.
+ if (SDL_Init(SDL_INIT_VIDEO) == -1) {
+ fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
+ exit(1);
+ }
+ SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
+ SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
+ SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
+ SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
+ SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
+ SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
+
+ SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
+ SDL_WINDOWPOS_UNDEFINED,
+ SDL_WINDOWPOS_UNDEFINED,
+ 32, 32,
+ SDL_WINDOW_OPENGL);
+ SDL_GLContext context = SDL_GL_CreateContext(window);
+ assert(context != nullptr);
+
+ if (argc >= 2)
+ readpix(rgb, argv[1]);
+ else
+ readpix(rgb, "color.pnm");
+ convert_ycbcr();
+
+ // Compile the shader.
+ string shader_src = ::read_file("encoder.shader");
+ GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
+ GLuint glsl_program_num = glCreateProgram();
+ glAttachShader(glsl_program_num, shader_num);
+ glLinkProgram(glsl_program_num);
+
+ GLint success;
+ glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
+ if (success == GL_FALSE) {
+ GLchar error_log[1024] = {0};
+ glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
+ fprintf(stderr, "Error linking program: %s\n", error_log);
+ exit(1);
+ }
+
+ glUseProgram(glsl_program_num);
+
+ // Upload luma.
+ GLuint y_tex;
+ glGenTextures(1, &y_tex);
+ glBindTexture(GL_TEXTURE_2D, y_tex);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_R8I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, pix_y);
+ check_error();
+
+ // Make destination textures.
+ GLuint dc_ac7_tex, ac1_ac6_tex, ac2_ac5_tex;
+ for (GLuint *tex : { &dc_ac7_tex, &ac1_ac6_tex, &ac2_ac5_tex }) {
+ glGenTextures(1, tex);
+ glBindTexture(GL_TEXTURE_2D, *tex);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_R16UI, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, nullptr);
+ check_error();
+ }
+
+ GLuint ac3_tex, ac4_tex;
+ for (GLuint *tex : { &ac3_tex, &ac4_tex }) {
+ glGenTextures(1, tex);
+ glBindTexture(GL_TEXTURE_2D, *tex);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_R8I, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_BYTE, nullptr);
+ check_error();
+ }
+
+ GLint dc_ac7_tex_uniform = glGetUniformLocation(glsl_program_num, "dc_ac7_tex");
+ GLint ac1_ac6_tex_uniform = glGetUniformLocation(glsl_program_num, "ac1_ac6_tex");
+ GLint ac2_ac5_tex_uniform = glGetUniformLocation(glsl_program_num, "ac2_ac5_tex");
+ GLint ac3_tex_uniform = glGetUniformLocation(glsl_program_num, "ac3_tex");
+ GLint ac4_tex_uniform = glGetUniformLocation(glsl_program_num, "ac4_tex");
+ GLint image_tex_uniform = glGetUniformLocation(glsl_program_num, "image_tex");
+
+ glUniform1i(dc_ac7_tex_uniform, 0);
+ glUniform1i(ac1_ac6_tex_uniform, 1);
+ glUniform1i(ac2_ac5_tex_uniform, 2);
+ glUniform1i(ac3_tex_uniform, 3);
+ glUniform1i(ac4_tex_uniform, 4);
+ glUniform1i(image_tex_uniform, 5);
+ glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
+ glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
+ glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
+ glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
+ glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
+ glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
+ check_error();
+
+ steady_clock::time_point start = steady_clock::now();
+ unsigned num_iterations = 1000;
+ for (unsigned i = 0; i < num_iterations; ++i) {
+ glDispatchCompute(WIDTH_BLOCKS, HEIGHT_BLOCKS, 1);
+ }
+ check_error();
+ glFinish();
+ steady_clock::time_point now = steady_clock::now();
+
+ // CPU part starts here -- will be GPU later.
+ // We only do luma for now.
+
+ int16_t *coeff_y = new int16_t[WIDTH * HEIGHT];
+
+ glBindTexture(GL_TEXTURE_2D, dc_ac7_tex);
+ uint16_t *dc_ac7_data = new uint16_t[(WIDTH/8) * HEIGHT];
+ glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, dc_ac7_data);
+ check_error();
+
+ glBindTexture(GL_TEXTURE_2D, ac1_ac6_tex);
+ uint16_t *ac1_ac6_data = new uint16_t[(WIDTH/8) * HEIGHT];
+ glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac1_ac6_data);
+ check_error();
+
+ glBindTexture(GL_TEXTURE_2D, ac2_ac5_tex);
+ uint16_t *ac2_ac5_data = new uint16_t[(WIDTH/8) * HEIGHT];
+ glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac2_ac5_data);
+ check_error();
+
+ glBindTexture(GL_TEXTURE_2D, ac3_tex);
+ int8_t *ac3_data = new int8_t[(WIDTH/8) * HEIGHT];
+ glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac3_data);
+ check_error();
+
+ glBindTexture(GL_TEXTURE_2D, ac4_tex);
+ int8_t *ac4_data = new int8_t[(WIDTH/8) * HEIGHT];
+ glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac4_data);
+ check_error();
+
+ for (unsigned y = 0; y < HEIGHT; ++y) {
+ for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
+ coeff_y[y * WIDTH + xb*8 + 0] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 23) >> 23;
+ coeff_y[y * WIDTH + xb*8 + 7] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 16) >> 25;
+ coeff_y[y * WIDTH + xb*8 + 1] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 23) >> 23;
+ coeff_y[y * WIDTH + xb*8 + 6] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 16) >> 25;
+ coeff_y[y * WIDTH + xb*8 + 2] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 23) >> 23;
+ coeff_y[y * WIDTH + xb*8 + 5] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 16) >> 25;
+ coeff_y[y * WIDTH + xb*8 + 3] = ac3_data[y * (WIDTH/8) + xb];
+ coeff_y[y * WIDTH + xb*8 + 4] = ac4_data[y * (WIDTH/8) + xb];
+ }
+ }
+
+#if 1
+ for (unsigned y = 0; y < HEIGHT; ++y) {
+ for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
+ printf("%4d %4d %4d %4d %4d %4d %4d %4d | ",
+ coeff_y[y * WIDTH + xb*8 + 0],
+ coeff_y[y * WIDTH + xb*8 + 1],
+ coeff_y[y * WIDTH + xb*8 + 2],
+ coeff_y[y * WIDTH + xb*8 + 3],
+ coeff_y[y * WIDTH + xb*8 + 4],
+ coeff_y[y * WIDTH + xb*8 + 5],
+ coeff_y[y * WIDTH + xb*8 + 6],
+ coeff_y[y * WIDTH + xb*8 + 7]);
+ }
+ printf("\n");
+ }
+#endif
+
+ // DC coefficient pred from the right to left (within each slice)
+ for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
+ int prev_k = 128;
+
+ for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
+ unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
+ unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
+ int k = coeff_y[(yb * 8) * WIDTH + xb * 8];
+
+ coeff_y[(yb * 8) * WIDTH + xb * 8] = k - prev_k;
+
+ prev_k = k;
+ }
+ }
+
+ // For each coefficient, make some tables.
+ size_t extra_bits = 0;
+ for (unsigned i = 0; i < 64; ++i) {
+ stats[i].clear();
+ }
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 8; ++x) {
+ SymbolStats &s_luma = stats[pick_stats_for(x, y)];
+
+ // Luma
+ for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
+ for (unsigned xb = 0; xb < WIDTH; xb += 8) {
+ unsigned short k = abs(coeff_y[(yb + y) * WIDTH + (xb + x)]);
+ if (k >= ESCAPE_LIMIT) {
+ k = ESCAPE_LIMIT;
+ extra_bits += 12; // escape this one
+ }
+ ++s_luma.freqs[(k - 1) & (NUM_SYMS - 1)];
+ }
+ }
+ }
+ }
+
+ for (unsigned i = 0; i < 64; ++i) {
+ stats[i].freqs[NUM_SYMS - 1] /= 2; // zero, has no sign bits (yes, this is trickery)
+ stats[i].normalize_freqs(prob_scale);
+ stats[i].cum_freqs[NUM_SYMS] += stats[i].freqs[NUM_SYMS - 1];
+ stats[i].freqs[NUM_SYMS - 1] *= 2;
+ }
+
+ FILE *codedfp = fopen("coded.dat", "wb");
+ if (codedfp == nullptr) {
+ perror("coded.dat");
+ exit(1);
+ }
+
+ for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables.
+ // TODO: rather gamma-k or something
+ for (unsigned i = 0; i < 64; ++i) {
+ if (stats[i].cum_freqs[NUM_SYMS] == 0) {
+ continue;
+ }
+ printf("writing table %d\n", i);
+ for (unsigned j = 0; j < NUM_SYMS; ++j) {
+ write_varint(stats[i].freqs[j], codedfp);
+ }
+ }
+ }
+
+ RansEncoder rans_encoder;
+
+ size_t tot_bytes = 0;
+
+ // Luma
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 8; ++x) {
+ SymbolStats &s_luma = stats[pick_stats_for(x, y)];
+ rans_encoder.init_prob(s_luma);
+
+ // Luma
+ std::vector<int> lens;
+
+ rans_encoder.clear();
+ size_t num_bytes = 0;
+ for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
+ unsigned yb = block_idx / WIDTH_BLOCKS;
+ unsigned xb = block_idx % WIDTH_BLOCKS;
+
+ int k = coeff_y[(yb * 8 + y) * WIDTH + (xb * 8 + x)];
+ rans_encoder.encode_coeff(k);
+
+ if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
+ int l = rans_encoder.save_block(codedfp);
+ num_bytes += l;
+ lens.push_back(l);
+ }
+ }
+ tot_bytes += num_bytes;
+ printf("coeff %d Y': %ld bytes\n", y * 8 + x, num_bytes);
+ }
+ }
+
+ printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
+ tot_bytes - extra_bits / 8,
+ extra_bits,
+ extra_bits / 8,
+ tot_bytes);
+
+ printf("\n");
+ printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);
+
+}