8 #include <SDL2/SDL_error.h>
9 #include <SDL2/SDL_video.h>
18 #include <unordered_map>
20 #include <movit/util.h>
22 #include "ryg_rans/rans_byte.h"
23 #include "ryg_rans/renormalize.h"
28 #define WIDTH_BLOCKS (WIDTH/8)
29 #define WIDTH_BLOCKS_CHROMA (WIDTH/16)
30 #define HEIGHT_BLOCKS (HEIGHT/8)
31 #define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)
32 #define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)
35 #define ESCAPE_LIMIT (NUM_SYMS - 1)
36 #define BLOCKS_PER_STREAM 320
38 static constexpr uint32_t prob_bits = 12;
39 static constexpr uint32_t prob_scale = 1 << prob_bits;
41 unsigned char rgb[WIDTH * HEIGHT * 3];
42 unsigned char pix_y[WIDTH * HEIGHT];
43 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
44 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
47 using namespace std::chrono;
49 void write_varint(int x, FILE *fp)
52 putc((x & 0x7f) | 0x80, fp);
58 void readpix(unsigned char *ptr, const char *filename)
60 FILE *fp = fopen(filename, "rb");
66 fseek(fp, 0, SEEK_END);
68 assert(len >= WIDTH * HEIGHT * 3);
69 fseek(fp, len - WIDTH * HEIGHT * 3, SEEK_SET);
71 fread(ptr, 1, WIDTH * HEIGHT * 3, fp);
77 uint32_t freqs[NUM_SYMS];
78 uint32_t cum_freqs[NUM_SYMS + 1];
81 void calc_cum_freqs();
82 void normalize_freqs(uint32_t target_total);
85 void SymbolStats::clear()
87 for (int i=0; i < NUM_SYMS; i++)
91 void SymbolStats::calc_cum_freqs()
94 for (int i=0; i < NUM_SYMS; i++)
95 cum_freqs[i+1] = cum_freqs[i] + freqs[i];
98 void SymbolStats::normalize_freqs(uint32_t target_total)
100 uint64_t real_freq[NUM_SYMS + 1]; // hack
102 assert(target_total >= NUM_SYMS);
105 uint32_t cur_total = cum_freqs[NUM_SYMS];
107 if (cur_total == 0) return;
109 double ideal_cost = 0.0;
110 for (int i = 1; i <= NUM_SYMS; i++)
112 real_freq[i] = cum_freqs[i] - cum_freqs[i - 1];
113 if (real_freq[i] > 0)
114 ideal_cost -= real_freq[i] * log2(real_freq[i] / double(cur_total));
117 OptimalRenormalize(cum_freqs, NUM_SYMS, prob_scale);
119 // calculate updated freqs and make sure we didn't screw anything up
120 assert(cum_freqs[0] == 0 && cum_freqs[NUM_SYMS] == target_total);
121 for (int i=0; i < NUM_SYMS; i++) {
123 assert(cum_freqs[i+1] == cum_freqs[i]);
125 assert(cum_freqs[i+1] > cum_freqs[i]);
128 freqs[i] = cum_freqs[i+1] - cum_freqs[i];
131 double calc_cost = 0.0;
132 for (int i = 1; i <= NUM_SYMS; i++)
134 uint64_t freq = cum_freqs[i] - cum_freqs[i - 1];
135 if (real_freq[i] > 0)
136 calc_cost -= real_freq[i] * log2(freq / double(target_total));
139 static double total_loss = 0.0;
140 total_loss += calc_cost - ideal_cost;
141 static double total_loss_with_dp = 0.0;
142 double optimal_cost = 0.0;
143 //total_loss_with_dp += optimal_cost - ideal_cost;
144 printf("ideal cost = %.0f bits, DP cost = %.0f bits, calc cost = %.0f bits (loss = %.2f bytes, total loss = %.2f bytes, total loss with DP = %.2f bytes)\n",
145 ideal_cost, optimal_cost,
146 calc_cost, (calc_cost - ideal_cost) / 8.0, total_loss / 8.0, total_loss_with_dp / 8.0);
149 SymbolStats stats[128];
151 const int luma_mapping[64] = {
152 0, 0, 1, 1, 2, 2, 3, 3,
153 0, 0, 1, 2, 2, 2, 3, 3,
154 1, 1, 2, 2, 2, 3, 3, 3,
155 1, 1, 2, 2, 2, 3, 3, 3,
156 1, 2, 2, 2, 2, 3, 3, 3,
157 2, 2, 2, 2, 3, 3, 3, 3,
158 2, 2, 3, 3, 3, 3, 3, 3,
159 3, 3, 3, 3, 3, 3, 3, 3,
162 int pick_stats_for(int x, int y)
164 return luma_mapping[y * 8 + x];
171 out_buf.reset(new uint8_t[out_max_size]);
175 void init_prob(SymbolStats &s)
177 for (int i = 0; i < NUM_SYMS; i++) {
178 //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
179 RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
181 sign_bias = s.cum_freqs[NUM_SYMS];
186 out_end = out_buf.get() + out_max_size;
187 ptr = out_end; // *end* of output buffer
191 uint32_t save_block(FILE *codedfp) // Returns number of bytes.
193 RansEncFlush(&rans, &ptr);
194 //printf("post-flush = %08x\n", rans);
196 uint32_t num_rans_bytes = out_end - ptr;
197 if (num_rans_bytes == last_block.size() &&
198 memcmp(last_block.data(), ptr, last_block.size()) == 0) {
199 write_varint(0, codedfp);
203 last_block = string((const char *)ptr, num_rans_bytes);
206 write_varint(num_rans_bytes, codedfp);
207 //fwrite(&num_rans_bytes, 1, 4, codedfp);
208 fwrite(ptr, 1, num_rans_bytes, codedfp);
210 //printf("first rANS bytes: %02x %02x %02x %02x %02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]);
215 //printf("Saving block: %d rANS bytes\n", num_rans_bytes);
216 return num_rans_bytes;
217 //return num_rans_bytes;
220 void encode_coeff(short signed_k)
222 //printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
223 unsigned short k = abs(signed_k);
224 if (k >= ESCAPE_LIMIT) {
225 // Put the coefficient as a 1/(2^12) symbol _before_
226 // the 255 coefficient, since the decoder will read the
227 // 255 coefficient first.
228 RansEncPut(&rans, &ptr, k, 1, prob_bits);
231 RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
238 static constexpr size_t out_max_size = 32 << 20; // 32 MB.
239 static constexpr size_t max_num_sign = 1048576; // Way too big. And actually bytes.
241 unique_ptr<uint8_t[]> out_buf;
245 RansEncSymbol esyms[NUM_SYMS];
248 std::string last_block;
251 // Should be done on the GPU, of course, but irrelevant for the demonstration.
254 double coeff[3] = { 0.2126, 0.7152, 0.0722 }; // sum = 1.0
255 double cb_fac = 1.0 / (coeff[0] + coeff[1] + 1.0f - coeff[2]); // 0.539
256 double cr_fac = 1.0 / (1.0f - coeff[0] + coeff[1] + coeff[2]); // 0.635
258 unique_ptr<float[]> temp_cb(new float[WIDTH * HEIGHT]);
259 unique_ptr<float[]> temp_cr(new float[WIDTH * HEIGHT]);
260 for (unsigned yb = 0; yb < HEIGHT; ++yb) {
261 for (unsigned xb = 0; xb < WIDTH; ++xb) {
262 int r = rgb[((yb * WIDTH) + xb) * 3 + 0];
263 int g = rgb[((yb * WIDTH) + xb) * 3 + 1];
264 int b = rgb[((yb * WIDTH) + xb) * 3 + 2];
265 double y = std::min(std::max(coeff[0] * r + coeff[1] * g + coeff[2] * b, 0.0), 255.0);
266 double cb = (b - y) * cb_fac + 128.0;
267 double cr = (r - y) * cr_fac + 128.0;
268 pix_y[(yb * WIDTH) + xb] = lrint(y);
269 temp_cb[(yb * WIDTH) + xb] = cb;
270 temp_cr[(yb * WIDTH) + xb] = cr;
274 // Simple 4:2:2 subsampling with left convention.
275 for (unsigned yb = 0; yb < HEIGHT; ++yb) {
276 for (unsigned xb = 0; xb < WIDTH / 2; ++xb) {
277 int c0 = yb * WIDTH + std::max(int(xb) * 2 - 1, 0);
278 int c1 = yb * WIDTH + xb * 2;
279 int c2 = yb * WIDTH + xb * 2 + 1;
281 double cb = 0.25 * temp_cb[c0] + 0.5 * temp_cb[c1] + 0.25 * temp_cb[c2];
282 double cr = 0.25 * temp_cr[c0] + 0.5 * temp_cr[c1] + 0.25 * temp_cr[c2];
283 cb = std::min(std::max(cb, 0.0), 255.0);
284 cr = std::min(std::max(cr, 0.0), 255.0);
285 pix_cb[(yb * WIDTH/2) + xb] = lrint(cb);
286 pix_cr[(yb * WIDTH/2) + xb] = lrint(cr);
291 int main(int argc, char **argv)
293 // Set up an OpenGL context using SDL.
294 if (SDL_Init(SDL_INIT_VIDEO) == -1) {
295 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
298 SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
299 SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
300 SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
301 SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
302 SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
303 SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
305 SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
306 SDL_WINDOWPOS_UNDEFINED,
307 SDL_WINDOWPOS_UNDEFINED,
310 SDL_GLContext context = SDL_GL_CreateContext(window);
311 assert(context != nullptr);
314 readpix(rgb, argv[1]);
316 readpix(rgb, "color.pnm");
319 // Compile the shader.
320 string shader_src = ::read_file("encoder.shader");
321 GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
322 GLuint glsl_program_num = glCreateProgram();
323 glAttachShader(glsl_program_num, shader_num);
324 glLinkProgram(glsl_program_num);
327 glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
328 if (success == GL_FALSE) {
329 GLchar error_log[1024] = {0};
330 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
331 fprintf(stderr, "Error linking program: %s\n", error_log);
335 glUseProgram(glsl_program_num);
339 glGenTextures(1, &y_tex);
340 glBindTexture(GL_TEXTURE_2D, y_tex);
341 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
342 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
343 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
344 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
345 glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, pix_y);
348 // Make destination textures.
349 GLuint dc_ac7_tex, ac1_ac6_tex, ac2_ac5_tex;
350 for (GLuint *tex : { &dc_ac7_tex, &ac1_ac6_tex, &ac2_ac5_tex }) {
351 glGenTextures(1, tex);
352 glBindTexture(GL_TEXTURE_2D, *tex);
353 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
354 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
355 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
356 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
357 glTexImage2D(GL_TEXTURE_2D, 0, GL_R16UI, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, nullptr);
361 GLuint ac3_tex, ac4_tex;
362 for (GLuint *tex : { &ac3_tex, &ac4_tex }) {
363 glGenTextures(1, tex);
364 glBindTexture(GL_TEXTURE_2D, *tex);
365 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
366 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
367 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
368 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
369 glTexImage2D(GL_TEXTURE_2D, 0, GL_R8I, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_BYTE, nullptr);
373 GLint dc_ac7_tex_uniform = glGetUniformLocation(glsl_program_num, "dc_ac7_tex");
374 GLint ac1_ac6_tex_uniform = glGetUniformLocation(glsl_program_num, "ac1_ac6_tex");
375 GLint ac2_ac5_tex_uniform = glGetUniformLocation(glsl_program_num, "ac2_ac5_tex");
376 GLint ac3_tex_uniform = glGetUniformLocation(glsl_program_num, "ac3_tex");
377 GLint ac4_tex_uniform = glGetUniformLocation(glsl_program_num, "ac4_tex");
378 GLint image_tex_uniform = glGetUniformLocation(glsl_program_num, "image_tex");
380 glUniform1i(dc_ac7_tex_uniform, 0);
381 glUniform1i(ac1_ac6_tex_uniform, 1);
382 glUniform1i(ac2_ac5_tex_uniform, 2);
383 glUniform1i(ac3_tex_uniform, 3);
384 glUniform1i(ac4_tex_uniform, 4);
385 glUniform1i(image_tex_uniform, 5);
386 glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
387 glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
388 glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
389 glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
390 glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
391 glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
394 steady_clock::time_point start = steady_clock::now();
395 unsigned num_iterations = 1000;
396 for (unsigned i = 0; i < num_iterations; ++i) {
397 glDispatchCompute(WIDTH_BLOCKS, HEIGHT_BLOCKS, 1);
401 steady_clock::time_point now = steady_clock::now();
403 // CPU part starts here -- will be GPU later.
404 // We only do luma for now.
406 int16_t *coeff_y = new int16_t[WIDTH * HEIGHT];
408 glBindTexture(GL_TEXTURE_2D, dc_ac7_tex);
409 uint16_t *dc_ac7_data = new uint16_t[(WIDTH/8) * HEIGHT];
410 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, dc_ac7_data);
413 glBindTexture(GL_TEXTURE_2D, ac1_ac6_tex);
414 uint16_t *ac1_ac6_data = new uint16_t[(WIDTH/8) * HEIGHT];
415 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac1_ac6_data);
418 glBindTexture(GL_TEXTURE_2D, ac2_ac5_tex);
419 uint16_t *ac2_ac5_data = new uint16_t[(WIDTH/8) * HEIGHT];
420 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac2_ac5_data);
423 glBindTexture(GL_TEXTURE_2D, ac3_tex);
424 int8_t *ac3_data = new int8_t[(WIDTH/8) * HEIGHT];
425 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac3_data);
428 glBindTexture(GL_TEXTURE_2D, ac4_tex);
429 int8_t *ac4_data = new int8_t[(WIDTH/8) * HEIGHT];
430 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac4_data);
433 for (unsigned y = 0; y < HEIGHT; ++y) {
434 for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
435 coeff_y[y * WIDTH + xb*8 + 0] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 23) >> 23;
436 coeff_y[y * WIDTH + xb*8 + 7] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 16) >> 25;
437 coeff_y[y * WIDTH + xb*8 + 1] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 23) >> 23;
438 coeff_y[y * WIDTH + xb*8 + 6] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 16) >> 25;
439 coeff_y[y * WIDTH + xb*8 + 2] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 23) >> 23;
440 coeff_y[y * WIDTH + xb*8 + 5] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 16) >> 25;
441 coeff_y[y * WIDTH + xb*8 + 3] = ac3_data[y * (WIDTH/8) + xb];
442 coeff_y[y * WIDTH + xb*8 + 4] = ac4_data[y * (WIDTH/8) + xb];
447 for (unsigned y = 0; y < HEIGHT; ++y) {
448 for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
449 printf("%4d %4d %4d %4d %4d %4d %4d %4d | ",
450 coeff_y[y * WIDTH + xb*8 + 0],
451 coeff_y[y * WIDTH + xb*8 + 1],
452 coeff_y[y * WIDTH + xb*8 + 2],
453 coeff_y[y * WIDTH + xb*8 + 3],
454 coeff_y[y * WIDTH + xb*8 + 4],
455 coeff_y[y * WIDTH + xb*8 + 5],
456 coeff_y[y * WIDTH + xb*8 + 6],
457 coeff_y[y * WIDTH + xb*8 + 7]);
463 // DC coefficient pred from the right to left (within each slice)
464 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
467 for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
468 unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
469 unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
470 int k = coeff_y[(yb * 8) * WIDTH + xb * 8];
472 coeff_y[(yb * 8) * WIDTH + xb * 8] = k - prev_k;
478 // For each coefficient, make some tables.
479 size_t extra_bits = 0;
480 for (unsigned i = 0; i < 64; ++i) {
483 for (unsigned y = 0; y < 8; ++y) {
484 for (unsigned x = 0; x < 8; ++x) {
485 SymbolStats &s_luma = stats[pick_stats_for(x, y)];
488 for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
489 for (unsigned xb = 0; xb < WIDTH; xb += 8) {
490 unsigned short k = abs(coeff_y[(yb + y) * WIDTH + (xb + x)]);
491 if (k >= ESCAPE_LIMIT) {
493 extra_bits += 12; // escape this one
495 ++s_luma.freqs[(k - 1) & (NUM_SYMS - 1)];
501 for (unsigned i = 0; i < 64; ++i) {
502 stats[i].freqs[NUM_SYMS - 1] /= 2; // zero, has no sign bits (yes, this is trickery)
503 stats[i].normalize_freqs(prob_scale);
504 stats[i].cum_freqs[NUM_SYMS] += stats[i].freqs[NUM_SYMS - 1];
505 stats[i].freqs[NUM_SYMS - 1] *= 2;
508 FILE *codedfp = fopen("coded.dat", "wb");
509 if (codedfp == nullptr) {
514 for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables.
515 // TODO: rather gamma-k or something
516 for (unsigned i = 0; i < 64; ++i) {
517 if (stats[i].cum_freqs[NUM_SYMS] == 0) {
520 printf("writing table %d\n", i);
521 for (unsigned j = 0; j < NUM_SYMS; ++j) {
522 write_varint(stats[i].freqs[j], codedfp);
527 RansEncoder rans_encoder;
529 size_t tot_bytes = 0;
532 for (unsigned y = 0; y < 8; ++y) {
533 for (unsigned x = 0; x < 8; ++x) {
534 SymbolStats &s_luma = stats[pick_stats_for(x, y)];
535 rans_encoder.init_prob(s_luma);
538 std::vector<int> lens;
540 rans_encoder.clear();
541 size_t num_bytes = 0;
542 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
543 unsigned yb = block_idx / WIDTH_BLOCKS;
544 unsigned xb = block_idx % WIDTH_BLOCKS;
546 int k = coeff_y[(yb * 8 + y) * WIDTH + (xb * 8 + x)];
547 rans_encoder.encode_coeff(k);
549 if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
550 int l = rans_encoder.save_block(codedfp);
555 tot_bytes += num_bytes;
556 printf("coeff %d Y': %ld bytes\n", y * 8 + x, num_bytes);
560 printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
561 tot_bytes - extra_bits / 8,
567 printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);