8 #include <SDL2/SDL_error.h>
9 #include <SDL2/SDL_video.h>
18 #include <unordered_map>
20 #include <movit/util.h>
22 #include "ryg_rans/rans_byte.h"
23 #include "ryg_rans/renormalize.h"
28 #define WIDTH_BLOCKS (WIDTH/8)
29 #define WIDTH_BLOCKS_CHROMA (WIDTH/16)
30 #define HEIGHT_BLOCKS (HEIGHT/8)
31 #define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)
32 #define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)
35 #define ESCAPE_LIMIT (NUM_SYMS - 1)
36 #define BLOCKS_PER_STREAM 320
38 static constexpr uint32_t prob_bits = 12;
39 static constexpr uint32_t prob_scale = 1 << prob_bits;
41 unsigned char rgb[WIDTH * HEIGHT * 3];
42 unsigned char pix_y[WIDTH * HEIGHT];
43 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
44 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
47 using namespace std::chrono;
49 void write_varint(int x, FILE *fp)
52 putc((x & 0x7f) | 0x80, fp);
58 void readpix(unsigned char *ptr, const char *filename)
60 FILE *fp = fopen(filename, "rb");
66 fseek(fp, 0, SEEK_END);
68 assert(len >= WIDTH * HEIGHT * 3);
69 fseek(fp, len - WIDTH * HEIGHT * 3, SEEK_SET);
71 fread(ptr, 1, WIDTH * HEIGHT * 3, fp);
77 uint32_t freqs[NUM_SYMS];
78 uint32_t cum_freqs[NUM_SYMS + 1];
81 void calc_cum_freqs();
82 void normalize_freqs(uint32_t target_total);
85 void SymbolStats::clear()
87 for (int i=0; i < NUM_SYMS; i++)
91 void SymbolStats::calc_cum_freqs()
94 for (int i=0; i < NUM_SYMS; i++)
95 cum_freqs[i+1] = cum_freqs[i] + freqs[i];
98 void SymbolStats::normalize_freqs(uint32_t target_total)
100 uint64_t real_freq[NUM_SYMS + 1]; // hack
102 assert(target_total >= NUM_SYMS);
105 uint32_t cur_total = cum_freqs[NUM_SYMS];
107 if (cur_total == 0) return;
109 double ideal_cost = 0.0;
110 for (int i = 1; i <= NUM_SYMS; i++)
112 real_freq[i] = cum_freqs[i] - cum_freqs[i - 1];
113 if (real_freq[i] > 0)
114 ideal_cost -= real_freq[i] * log2(real_freq[i] / double(cur_total));
117 OptimalRenormalize(cum_freqs, NUM_SYMS, prob_scale);
119 // calculate updated freqs and make sure we didn't screw anything up
120 assert(cum_freqs[0] == 0 && cum_freqs[NUM_SYMS] == target_total);
121 for (int i=0; i < NUM_SYMS; i++) {
123 assert(cum_freqs[i+1] == cum_freqs[i]);
125 assert(cum_freqs[i+1] > cum_freqs[i]);
128 freqs[i] = cum_freqs[i+1] - cum_freqs[i];
131 double calc_cost = 0.0;
132 for (int i = 1; i <= NUM_SYMS; i++)
134 uint64_t freq = cum_freqs[i] - cum_freqs[i - 1];
135 if (real_freq[i] > 0)
136 calc_cost -= real_freq[i] * log2(freq / double(target_total));
139 static double total_loss = 0.0;
140 total_loss += calc_cost - ideal_cost;
141 static double total_loss_with_dp = 0.0;
142 double optimal_cost = 0.0;
143 //total_loss_with_dp += optimal_cost - ideal_cost;
144 printf("ideal cost = %.0f bits, DP cost = %.0f bits, calc cost = %.0f bits (loss = %.2f bytes, total loss = %.2f bytes, total loss with DP = %.2f bytes)\n",
145 ideal_cost, optimal_cost,
146 calc_cost, (calc_cost - ideal_cost) / 8.0, total_loss / 8.0, total_loss_with_dp / 8.0);
149 SymbolStats stats[128];
151 const int luma_mapping[64] = {
152 0, 0, 1, 1, 2, 2, 3, 3,
153 0, 0, 1, 2, 2, 2, 3, 3,
154 1, 1, 2, 2, 2, 3, 3, 3,
155 1, 1, 2, 2, 2, 3, 3, 3,
156 1, 2, 2, 2, 2, 3, 3, 3,
157 2, 2, 2, 2, 3, 3, 3, 3,
158 2, 2, 3, 3, 3, 3, 3, 3,
159 3, 3, 3, 3, 3, 3, 3, 3,
162 int pick_stats_for(int x, int y)
164 return luma_mapping[y * 8 + x];
171 out_buf.reset(new uint8_t[out_max_size]);
175 void init_prob(SymbolStats &s)
177 for (int i = 0; i < NUM_SYMS; i++) {
178 //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
179 RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
181 sign_bias = s.cum_freqs[NUM_SYMS];
186 out_end = out_buf.get() + out_max_size;
187 ptr = out_end; // *end* of output buffer
191 uint32_t save_block(FILE *codedfp) // Returns number of bytes.
193 RansEncFlush(&rans, &ptr);
194 //printf("post-flush = %08x\n", rans);
196 uint32_t num_rans_bytes = out_end - ptr;
197 if (num_rans_bytes == last_block.size() &&
198 memcmp(last_block.data(), ptr, last_block.size()) == 0) {
199 write_varint(0, codedfp);
203 last_block = string((const char *)ptr, num_rans_bytes);
206 write_varint(num_rans_bytes, codedfp);
207 //fwrite(&num_rans_bytes, 1, 4, codedfp);
208 fwrite(ptr, 1, num_rans_bytes, codedfp);
210 //printf("first rANS bytes: %02x %02x %02x %02x %02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]);
215 //printf("Saving block: %d rANS bytes\n", num_rans_bytes);
216 return num_rans_bytes;
217 //return num_rans_bytes;
220 void encode_coeff(short signed_k)
222 //printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
223 unsigned short k = abs(signed_k);
224 if (k >= ESCAPE_LIMIT) {
225 // Put the coefficient as a 1/(2^12) symbol _before_
226 // the 255 coefficient, since the decoder will read the
227 // 255 coefficient first.
228 RansEncPut(&rans, &ptr, k, 1, prob_bits);
231 RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
238 static constexpr size_t out_max_size = 32 << 20; // 32 MB.
239 static constexpr size_t max_num_sign = 1048576; // Way too big. And actually bytes.
241 unique_ptr<uint8_t[]> out_buf;
245 RansEncSymbol esyms[NUM_SYMS];
248 std::string last_block;
251 // Should be done on the GPU, of course, but irrelevant for the demonstration.
254 double coeff[3] = { 0.2126, 0.7152, 0.0722 }; // sum = 1.0
255 double cb_fac = 1.0 / (coeff[0] + coeff[1] + 1.0f - coeff[2]); // 0.539
256 double cr_fac = 1.0 / (1.0f - coeff[0] + coeff[1] + coeff[2]); // 0.635
258 unique_ptr<float[]> temp_cb(new float[WIDTH * HEIGHT]);
259 unique_ptr<float[]> temp_cr(new float[WIDTH * HEIGHT]);
260 for (unsigned yb = 0; yb < HEIGHT; ++yb) {
261 for (unsigned xb = 0; xb < WIDTH; ++xb) {
262 int r = rgb[((yb * WIDTH) + xb) * 3 + 0];
263 int g = rgb[((yb * WIDTH) + xb) * 3 + 1];
264 int b = rgb[((yb * WIDTH) + xb) * 3 + 2];
265 double y = std::min(std::max(coeff[0] * r + coeff[1] * g + coeff[2] * b, 0.0), 255.0);
266 double cb = (b - y) * cb_fac + 128.0;
267 double cr = (r - y) * cr_fac + 128.0;
268 pix_y[(yb * WIDTH) + xb] = lrint(y);
269 temp_cb[(yb * WIDTH) + xb] = cb;
270 temp_cr[(yb * WIDTH) + xb] = cr;
274 // Simple 4:2:2 subsampling with left convention.
275 for (unsigned yb = 0; yb < HEIGHT; ++yb) {
276 for (unsigned xb = 0; xb < WIDTH / 2; ++xb) {
277 int c0 = yb * WIDTH + std::max(int(xb) * 2 - 1, 0);
278 int c1 = yb * WIDTH + xb * 2;
279 int c2 = yb * WIDTH + xb * 2 + 1;
281 double cb = 0.25 * temp_cb[c0] + 0.5 * temp_cb[c1] + 0.25 * temp_cb[c2];
282 double cr = 0.25 * temp_cr[c0] + 0.5 * temp_cr[c1] + 0.25 * temp_cr[c2];
283 cb = std::min(std::max(cb, 0.0), 255.0);
284 cr = std::min(std::max(cr, 0.0), 255.0);
285 pix_cb[(yb * WIDTH/2) + xb] = lrint(cb);
286 pix_cr[(yb * WIDTH/2) + xb] = lrint(cr);
291 int main(int argc, char **argv)
293 // Set up an OpenGL context using SDL.
294 if (SDL_Init(SDL_INIT_VIDEO) == -1) {
295 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
298 SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
299 SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
300 SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
301 SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
302 SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
303 SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
305 SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
306 SDL_WINDOWPOS_UNDEFINED,
307 SDL_WINDOWPOS_UNDEFINED,
310 SDL_GLContext context = SDL_GL_CreateContext(window);
311 assert(context != nullptr);
314 readpix(rgb, argv[1]);
316 readpix(rgb, "color.pnm");
319 // Compile the shader.
320 string shader_src = ::read_file("encoder.shader");
321 GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
322 GLuint glsl_program_num = glCreateProgram();
323 glAttachShader(glsl_program_num, shader_num);
324 glLinkProgram(glsl_program_num);
327 glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
328 if (success == GL_FALSE) {
329 GLchar error_log[1024] = {0};
330 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
331 fprintf(stderr, "Error linking program: %s\n", error_log);
335 // Compile the tally shader.
336 shader_src = ::read_file("tally.shader");
337 shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
338 GLuint glsl_tally_program_num = glCreateProgram();
339 glAttachShader(glsl_tally_program_num, shader_num);
340 glLinkProgram(glsl_tally_program_num);
342 glGetProgramiv(glsl_tally_program_num, GL_LINK_STATUS, &success);
343 if (success == GL_FALSE) {
344 GLchar error_log[1024] = {0};
345 glGetProgramInfoLog(glsl_tally_program_num, 1024, nullptr, error_log);
346 fprintf(stderr, "Error linking program: %s\n", error_log);
350 glUseProgram(glsl_program_num);
352 // An SSBO for the rANS distributions.
354 glGenBuffers(1, &ssbo);
355 glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
356 glBufferData(GL_SHADER_STORAGE_BUFFER, 256 * 4 * sizeof(uint32_t), nullptr, GL_DYNAMIC_COPY);
357 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
361 glGenTextures(1, &y_tex);
362 glBindTexture(GL_TEXTURE_2D, y_tex);
363 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
364 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
365 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
366 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
367 glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, pix_y);
370 // Make destination textures.
371 GLuint dc_ac7_tex, ac1_ac6_tex, ac2_ac5_tex;
372 for (GLuint *tex : { &dc_ac7_tex, &ac1_ac6_tex, &ac2_ac5_tex }) {
373 glGenTextures(1, tex);
374 glBindTexture(GL_TEXTURE_2D, *tex);
375 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
376 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
377 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
378 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
379 glTexImage2D(GL_TEXTURE_2D, 0, GL_R16UI, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, nullptr);
383 GLuint ac3_tex, ac4_tex;
384 for (GLuint *tex : { &ac3_tex, &ac4_tex }) {
385 glGenTextures(1, tex);
386 glBindTexture(GL_TEXTURE_2D, *tex);
387 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
388 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
389 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
390 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
391 glTexImage2D(GL_TEXTURE_2D, 0, GL_R8I, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_BYTE, nullptr);
395 GLint dc_ac7_tex_uniform = glGetUniformLocation(glsl_program_num, "dc_ac7_tex");
396 GLint ac1_ac6_tex_uniform = glGetUniformLocation(glsl_program_num, "ac1_ac6_tex");
397 GLint ac2_ac5_tex_uniform = glGetUniformLocation(glsl_program_num, "ac2_ac5_tex");
398 GLint ac3_tex_uniform = glGetUniformLocation(glsl_program_num, "ac3_tex");
399 GLint ac4_tex_uniform = glGetUniformLocation(glsl_program_num, "ac4_tex");
400 GLint image_tex_uniform = glGetUniformLocation(glsl_program_num, "image_tex");
402 glUniform1i(dc_ac7_tex_uniform, 0);
403 glUniform1i(ac1_ac6_tex_uniform, 1);
404 glUniform1i(ac2_ac5_tex_uniform, 2);
405 glUniform1i(ac3_tex_uniform, 3);
406 glUniform1i(ac4_tex_uniform, 4);
407 glUniform1i(image_tex_uniform, 5);
408 glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
409 glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
410 glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
411 glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
412 glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
413 glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
416 glUseProgram(glsl_tally_program_num);
417 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
419 steady_clock::time_point start = steady_clock::now();
420 unsigned num_iterations = 1000;
421 for (unsigned i = 0; i < num_iterations; ++i) {
422 glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 4 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr);
423 glUseProgram(glsl_program_num);
424 glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
425 glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
427 glUseProgram(glsl_tally_program_num);
428 glDispatchCompute(4, 1, 1);
432 steady_clock::time_point now = steady_clock::now();
434 // CPU part starts here -- will be GPU later.
435 // We only do luma for now.
437 int16_t *coeff_y = new int16_t[WIDTH * HEIGHT];
439 glBindTexture(GL_TEXTURE_2D, dc_ac7_tex);
440 uint16_t *dc_ac7_data = new uint16_t[(WIDTH/8) * HEIGHT];
441 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, dc_ac7_data);
444 glBindTexture(GL_TEXTURE_2D, ac1_ac6_tex);
445 uint16_t *ac1_ac6_data = new uint16_t[(WIDTH/8) * HEIGHT];
446 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac1_ac6_data);
449 glBindTexture(GL_TEXTURE_2D, ac2_ac5_tex);
450 uint16_t *ac2_ac5_data = new uint16_t[(WIDTH/8) * HEIGHT];
451 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac2_ac5_data);
454 glBindTexture(GL_TEXTURE_2D, ac3_tex);
455 int8_t *ac3_data = new int8_t[(WIDTH/8) * HEIGHT];
456 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac3_data);
459 glBindTexture(GL_TEXTURE_2D, ac4_tex);
460 int8_t *ac4_data = new int8_t[(WIDTH/8) * HEIGHT];
461 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac4_data);
464 for (unsigned y = 0; y < HEIGHT; ++y) {
465 for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
466 coeff_y[y * WIDTH + xb*8 + 0] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 23) >> 23;
467 coeff_y[y * WIDTH + xb*8 + 7] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 16) >> 25;
468 coeff_y[y * WIDTH + xb*8 + 1] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 23) >> 23;
469 coeff_y[y * WIDTH + xb*8 + 6] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 16) >> 25;
470 coeff_y[y * WIDTH + xb*8 + 2] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 23) >> 23;
471 coeff_y[y * WIDTH + xb*8 + 5] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 16) >> 25;
472 coeff_y[y * WIDTH + xb*8 + 3] = ac3_data[y * (WIDTH/8) + xb];
473 coeff_y[y * WIDTH + xb*8 + 4] = ac4_data[y * (WIDTH/8) + xb];
478 for (unsigned y = 0; y < HEIGHT; ++y) {
479 for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
480 printf("%4d %4d %4d %4d %4d %4d %4d %4d | ",
481 coeff_y[y * WIDTH + xb*8 + 0],
482 coeff_y[y * WIDTH + xb*8 + 1],
483 coeff_y[y * WIDTH + xb*8 + 2],
484 coeff_y[y * WIDTH + xb*8 + 3],
485 coeff_y[y * WIDTH + xb*8 + 4],
486 coeff_y[y * WIDTH + xb*8 + 5],
487 coeff_y[y * WIDTH + xb*8 + 6],
488 coeff_y[y * WIDTH + xb*8 + 7]);
489 printf("%4d %4d %4d %4d %4d %4d %4d %4d || ",
490 pix_y[y * WIDTH + xb*8 + 0],
491 pix_y[y * WIDTH + xb*8 + 1],
492 pix_y[y * WIDTH + xb*8 + 2],
493 pix_y[y * WIDTH + xb*8 + 3],
494 pix_y[y * WIDTH + xb*8 + 4],
495 pix_y[y * WIDTH + xb*8 + 5],
496 pix_y[y * WIDTH + xb*8 + 6],
497 pix_y[y * WIDTH + xb*8 + 7]);
503 // DC coefficient pred from the right to left (within each slice)
504 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
507 for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
508 unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
509 unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
510 int k = coeff_y[(yb * 8) * WIDTH + xb * 8];
512 coeff_y[(yb * 8) * WIDTH + xb * 8] = k - prev_k;
518 // For each coefficient, make some tables.
519 size_t extra_bits = 0;
520 for (unsigned i = 0; i < 64; ++i) {
523 for (unsigned y = 0; y < 8; ++y) {
524 for (unsigned x = 0; x < 8; ++x) {
525 SymbolStats &s_luma = stats[pick_stats_for(x, y)];
528 for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
529 for (unsigned xb = 0; xb < WIDTH; xb += 8) {
530 unsigned short k = abs(coeff_y[(yb + y) * WIDTH + (xb + x)]);
531 if (k >= ESCAPE_LIMIT) {
533 extra_bits += 12; // escape this one
535 ++s_luma.freqs[(k - 1) & (NUM_SYMS - 1)];
541 for (unsigned i = 0; i < 64; ++i) {
542 stats[i].freqs[NUM_SYMS - 1] /= 2; // zero, has no sign bits (yes, this is trickery)
543 stats[i].normalize_freqs(prob_scale);
544 stats[i].cum_freqs[NUM_SYMS] += stats[i].freqs[NUM_SYMS - 1];
545 stats[i].freqs[NUM_SYMS - 1] *= 2;
548 FILE *codedfp = fopen("coded.dat", "wb");
549 if (codedfp == nullptr) {
554 for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables.
555 // TODO: rather gamma-k or something
556 for (unsigned i = 0; i < 64; ++i) {
557 if (stats[i].cum_freqs[NUM_SYMS] == 0) {
560 printf("writing table %d\n", i);
561 for (unsigned j = 0; j < NUM_SYMS; ++j) {
562 write_varint(stats[i].freqs[j], codedfp);
567 RansEncoder rans_encoder;
569 size_t tot_bytes = 0;
572 for (unsigned y = 0; y < 8; ++y) {
573 for (unsigned x = 0; x < 8; ++x) {
574 SymbolStats &s_luma = stats[pick_stats_for(x, y)];
575 rans_encoder.init_prob(s_luma);
578 std::vector<int> lens;
580 rans_encoder.clear();
581 size_t num_bytes = 0;
582 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
583 unsigned yb = block_idx / WIDTH_BLOCKS;
584 unsigned xb = block_idx % WIDTH_BLOCKS;
586 int k = coeff_y[(yb * 8 + y) * WIDTH + (xb * 8 + x)];
587 rans_encoder.encode_coeff(k);
589 if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
590 int l = rans_encoder.save_block(codedfp);
595 tot_bytes += num_bytes;
596 printf("coeff %d Y': %ld bytes\n", y * 8 + x, num_bytes);
600 printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
601 tot_bytes - extra_bits / 8,
607 printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);
610 glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
611 const uint32_t *dist = (const uint32_t *)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
612 for (int i = 0; i < 1024; ++i) {
613 printf("%d,%d: %u\n", i / 256, i % 256, dist[i]);