8 #include "ryg_rans/rans64.h"
9 //#include "ryg_rans/rans_byte.h"
10 #include "ryg_rans/renormalize.h"
17 #include <unordered_map>
21 #define WIDTH_BLOCKS (WIDTH/8)
22 #define WIDTH_BLOCKS_CHROMA (WIDTH/16)
23 #define HEIGHT_BLOCKS (HEIGHT/8)
24 #define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)
25 #define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)
28 #define ESCAPE_LIMIT (NUM_SYMS - 1)
29 #define BLOCKS_PER_STREAM 320
31 // If you set this to 1, the program will try to optimize the placement
32 // of coefficients to rANS probability distributions. This is randomized,
33 // so you might want to run it a few times.
34 #define FIND_OPTIMAL_STREAM_ASSIGNMENT 0
35 #define NUM_CLUSTERS 4
37 static constexpr uint32_t prob_bits = 12;
38 static constexpr uint32_t prob_scale = 1 << prob_bits;
42 void fdct_int32(short *const In);
43 void idct_int32(short *const In);
45 unsigned char rgb[WIDTH * HEIGHT * 3];
46 unsigned char pix_y[WIDTH * HEIGHT];
47 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
48 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
49 unsigned char full_pix_cb[WIDTH * HEIGHT];
50 unsigned char full_pix_cr[WIDTH * HEIGHT];
51 short coeff_y[WIDTH * HEIGHT], coeff_cb[(WIDTH/2) * HEIGHT], coeff_cr[(WIDTH/2) * HEIGHT];
56 if (x > 255) return 255;
60 static const unsigned char std_luminance_quant_tbl[64] = {
62 16, 11, 10, 16, 24, 40, 51, 61,
63 12, 12, 14, 19, 26, 58, 60, 55,
64 14, 13, 16, 24, 40, 57, 69, 56,
65 14, 17, 22, 29, 51, 87, 80, 62,
66 18, 22, 37, 56, 68, 109, 103, 77,
67 24, 35, 55, 64, 81, 104, 113, 92,
68 49, 64, 78, 87, 103, 121, 120, 101,
69 72, 92, 95, 98, 112, 100, 103, 99
71 // ff_mpeg1_default_intra_matrix
72 8, 16, 19, 22, 26, 27, 29, 34,
73 16, 16, 22, 24, 27, 29, 34, 37,
74 19, 22, 26, 27, 29, 34, 34, 38,
75 22, 22, 26, 27, 29, 34, 37, 40,
76 22, 26, 27, 29, 32, 35, 40, 48,
77 26, 27, 29, 32, 35, 40, 48, 58,
78 26, 27, 29, 34, 38, 46, 56, 69,
79 27, 29, 35, 38, 46, 56, 69, 83
85 uint32_t freqs[NUM_SYMS];
86 uint32_t cum_freqs[NUM_SYMS + 1];
89 void calc_cum_freqs();
90 void normalize_freqs(uint32_t target_total);
93 void SymbolStats::clear()
95 for (int i=0; i < NUM_SYMS; i++)
99 void SymbolStats::calc_cum_freqs()
102 for (int i=0; i < NUM_SYMS; i++)
103 cum_freqs[i+1] = cum_freqs[i] + freqs[i];
106 void SymbolStats::normalize_freqs(uint32_t target_total)
108 uint64_t real_freq[NUM_SYMS + 1]; // hack
110 assert(target_total >= NUM_SYMS);
113 uint32_t cur_total = cum_freqs[NUM_SYMS];
115 if (cur_total == 0) return;
117 double ideal_cost = 0.0;
118 for (int i = 1; i <= NUM_SYMS; i++)
120 real_freq[i] = cum_freqs[i] - cum_freqs[i - 1];
121 if (real_freq[i] > 0)
122 ideal_cost -= real_freq[i] * log2(real_freq[i] / double(cur_total));
125 OptimalRenormalize(cum_freqs, NUM_SYMS, prob_scale);
127 // calculate updated freqs and make sure we didn't screw anything up
128 assert(cum_freqs[0] == 0 && cum_freqs[NUM_SYMS] == target_total);
129 for (int i=0; i < NUM_SYMS; i++) {
131 assert(cum_freqs[i+1] == cum_freqs[i]);
133 assert(cum_freqs[i+1] > cum_freqs[i]);
136 freqs[i] = cum_freqs[i+1] - cum_freqs[i];
139 double calc_cost = 0.0;
140 for (int i = 1; i <= NUM_SYMS; i++)
142 uint64_t freq = cum_freqs[i] - cum_freqs[i - 1];
143 if (real_freq[i] > 0)
144 calc_cost -= real_freq[i] * log2(freq / double(target_total));
147 static double total_loss = 0.0;
148 total_loss += calc_cost - ideal_cost;
149 static double total_loss_with_dp = 0.0;
150 double optimal_cost = 0.0;
151 //total_loss_with_dp += optimal_cost - ideal_cost;
152 printf("ideal cost = %.0f bits, DP cost = %.0f bits, calc cost = %.0f bits (loss = %.2f bytes, total loss = %.2f bytes, total loss with DP = %.2f bytes)\n",
153 ideal_cost, optimal_cost,
154 calc_cost, (calc_cost - ideal_cost) / 8.0, total_loss / 8.0, total_loss_with_dp / 8.0);
157 SymbolStats stats[128];
159 #if FIND_OPTIMAL_STREAM_ASSIGNMENT
160 // Distance from one stream to the other, based on a hacked-up K-L divergence.
161 float kl_dist[64][64];
164 const int luma_mapping[64] = {
165 0, 0, 1, 1, 2, 2, 3, 3,
166 0, 0, 1, 2, 2, 2, 3, 3,
167 1, 1, 2, 2, 2, 3, 3, 3,
168 1, 1, 2, 2, 2, 3, 3, 3,
169 1, 2, 2, 2, 2, 3, 3, 3,
170 2, 2, 2, 2, 3, 3, 3, 3,
171 2, 2, 3, 3, 3, 3, 3, 3,
172 3, 3, 3, 3, 3, 3, 3, 3,
174 const int chroma_mapping[64] = {
175 0, 1, 1, 2, 2, 2, 3, 3,
176 1, 1, 2, 2, 2, 3, 3, 3,
177 2, 2, 2, 2, 3, 3, 3, 3,
178 2, 2, 2, 3, 3, 3, 3, 3,
179 2, 3, 3, 3, 3, 3, 3, 3,
180 3, 3, 3, 3, 3, 3, 3, 3,
181 3, 3, 3, 3, 3, 3, 3, 3,
182 3, 3, 3, 3, 3, 3, 3, 3,
185 int pick_stats_for(int x, int y, bool is_chroma)
187 #if FIND_OPTIMAL_STREAM_ASSIGNMENT
188 return y * 8 + x + is_chroma * 64;
191 return chroma_mapping[y * 8 + x] + 4;
193 return luma_mapping[y * 8 + x];
199 void write_varint(int x, FILE *fp)
201 fwrite(&x, sizeof(x), 1, fp);
208 out_buf.reset(new uint8_t[out_max_size]);
212 void init_prob(SymbolStats &s)
214 for (int i = 0; i < NUM_SYMS; i++) {
215 //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
216 Rans64EncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
218 sign_bias = s.cum_freqs[NUM_SYMS];
223 out_end = out_buf.get() + out_max_size;
224 ptr = out_end; // *end* of output buffer
225 Rans64EncInit(&rans);
228 uint32_t save_block(FILE *codedfp) // Returns number of bytes.
230 Rans64EncFlush(&rans, (uint32_t **)&ptr);
231 //printf("post-flush = %08x\n", rans);
233 uint32_t num_rans_bytes = out_end - ptr;
235 if (num_rans_bytes == 4) {
237 memcpy(&block, ptr, 4);
239 if (block == last_block) {
240 write_varint(0, codedfp);
251 write_varint(num_rans_bytes, codedfp);
252 //fwrite(&num_rans_bytes, 1, 4, codedfp);
253 fwrite(ptr, 1, num_rans_bytes, codedfp);
255 //printf("first rANS bytes: %02x %02x %02x %02x %02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]);
260 //printf("Saving block: %d rANS bytes\n", num_rans_bytes);
261 return num_rans_bytes;
262 //return num_rans_bytes;
265 void encode_coeff(short signed_k)
267 //printf("encoding coeff %d (sym %d), rans before encoding = %016lx\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
268 unsigned short k = abs(signed_k);
269 if (k >= ESCAPE_LIMIT) {
270 // Put the coefficient as a 1/(2^12) symbol _before_
271 // the 255 coefficient, since the decoder will read the
272 // 255 coefficient first.
273 Rans64EncPut(&rans, (uint32_t **)&ptr, k, 1, prob_bits);
276 Rans64EncPutSymbol(&rans, (uint32_t **)&ptr, &esyms[(k - 1) & (NUM_SYMS - 1)], prob_bits + 1);
283 static constexpr size_t out_max_size = 32 << 20; // 32 MB.
284 static constexpr size_t max_num_sign = 1048576; // Way too big. And actually bytes.
286 unique_ptr<uint8_t[]> out_buf;
290 Rans64EncSymbol esyms[NUM_SYMS];
293 uint32_t last_block = 0; // Not a valid 4-byte rANS block (?)
296 static constexpr int dc_scalefac = 8; // Matches the FDCT's gain.
297 static constexpr double quant_scalefac = 4.0; // whatever?
299 static inline int quantize(int f, int coeff_idx)
301 if (coeff_idx == 0) {
302 return f / dc_scalefac;
308 const int w = std_luminance_quant_tbl[coeff_idx];
309 const int s = quant_scalefac;
310 int sign_f = (f > 0) ? 1 : -1;
311 return (32 * f + sign_f * w * s) / (2 * w * s);
314 static inline int unquantize(int qf, int coeff_idx)
316 if (coeff_idx == 0) {
317 return qf * dc_scalefac;
323 const int w = std_luminance_quant_tbl[coeff_idx];
324 const int s = quant_scalefac;
325 return (2 * qf * w * s) / 32;
328 void readpix(unsigned char *ptr, const char *filename)
330 FILE *fp = fopen(filename, "rb");
336 fseek(fp, 0, SEEK_END);
337 long len = ftell(fp);
338 assert(len >= WIDTH * HEIGHT * 3);
339 fseek(fp, len - WIDTH * HEIGHT * 3, SEEK_SET);
341 fread(ptr, 1, WIDTH * HEIGHT * 3, fp);
347 double coeff[3] = { 0.2126, 0.7152, 0.0722 }; // sum = 1.0
348 double cb_fac = 1.0 / (coeff[0] + coeff[1] + 1.0f - coeff[2]); // 0.539
349 double cr_fac = 1.0 / (1.0f - coeff[0] + coeff[1] + coeff[2]); // 0.635
351 unique_ptr<float[]> temp_cb(new float[WIDTH * HEIGHT]);
352 unique_ptr<float[]> temp_cr(new float[WIDTH * HEIGHT]);
353 for (unsigned yb = 0; yb < HEIGHT; ++yb) {
354 for (unsigned xb = 0; xb < WIDTH; ++xb) {
355 int r = rgb[((yb * WIDTH) + xb) * 3 + 0];
356 int g = rgb[((yb * WIDTH) + xb) * 3 + 1];
357 int b = rgb[((yb * WIDTH) + xb) * 3 + 2];
358 double y = std::min(std::max(coeff[0] * r + coeff[1] * g + coeff[2] * b, 0.0), 255.0);
359 double cb = (b - y) * cb_fac + 128.0;
360 double cr = (r - y) * cr_fac + 128.0;
361 pix_y[(yb * WIDTH) + xb] = lrint(y);
362 temp_cb[(yb * WIDTH) + xb] = cb;
363 temp_cr[(yb * WIDTH) + xb] = cr;
364 full_pix_cb[(yb * WIDTH) + xb] = lrint(std::min(std::max(cb, 0.0), 255.0));
365 full_pix_cr[(yb * WIDTH) + xb] = lrint(std::min(std::max(cr, 0.0), 255.0));
369 // Simple 4:2:2 subsampling with left convention.
370 for (unsigned yb = 0; yb < HEIGHT; ++yb) {
371 for (unsigned xb = 0; xb < WIDTH / 2; ++xb) {
372 int c0 = yb * WIDTH + std::max(int(xb) * 2 - 1, 0);
373 int c1 = yb * WIDTH + xb * 2;
374 int c2 = yb * WIDTH + xb * 2 + 1;
376 double cb = 0.25 * temp_cb[c0] + 0.5 * temp_cb[c1] + 0.25 * temp_cb[c2];
377 double cr = 0.25 * temp_cr[c0] + 0.5 * temp_cr[c1] + 0.25 * temp_cr[c2];
378 cb = std::min(std::max(cb, 0.0), 255.0);
379 cr = std::min(std::max(cr, 0.0), 255.0);
380 pix_cb[(yb * WIDTH/2) + xb] = lrint(cb);
381 pix_cr[(yb * WIDTH/2) + xb] = lrint(cr);
386 #if FIND_OPTIMAL_STREAM_ASSIGNMENT
387 double find_best_assignment(const int *medoids, int *assignment)
389 double current_score = 0.0;
390 for (int i = 0; i < 64; ++i) {
391 int best_medoid = medoids[0];
392 float best_medoid_score = kl_dist[i][medoids[0]];
393 for (int j = 1; j < NUM_CLUSTERS; ++j) {
394 if (kl_dist[i][medoids[j]] < best_medoid_score) {
395 best_medoid = medoids[j];
396 best_medoid_score = kl_dist[i][medoids[j]];
399 assignment[i] = best_medoid;
400 current_score += best_medoid_score;
402 return current_score;
405 void find_optimal_stream_assignment(int base)
408 for (unsigned i = 0; i < 64; ++i) {
410 for (unsigned k = 0; k < NUM_SYMS; ++k) {
411 s += stats[i + base].freqs[k] + 0.5;
413 inv_sum[i] = 1.0 / s;
416 for (unsigned i = 0; i < 64; ++i) {
417 for (unsigned j = 0; j < 64; ++j) {
419 for (unsigned k = 0; k < NUM_SYMS; ++k) {
420 double p1 = (stats[i + base].freqs[k] + 0.5) * inv_sum[i];
421 double p2 = (stats[j + base].freqs[k] + 0.5) * inv_sum[j];
423 // K-L divergence is asymmetric; this is a hack.
424 d += p1 * log(p1 / p2);
425 d += p2 * log(p2 / p1);
428 //printf("%.3f ", d);
434 int medoids[64]; // only the first NUM_CLUSTERS matter
435 bool is_medoid[64] = { false };
436 std::iota(medoids, medoids + 64, 0);
437 std::random_device rd;
438 std::mt19937 g(rd());
439 std::shuffle(medoids, medoids + 64, g);
440 for (int i = 0; i < NUM_CLUSTERS; ++i) {
441 printf("%d ", medoids[i]);
442 is_medoid[medoids[i]] = true;
446 // assign each data point to the closest medoid
448 double current_score = find_best_assignment(medoids, assignment);
450 for (int i = 0; i < 1000; ++i) {
451 printf("iter %d\n", i);
452 bool any_changed = false;
453 for (int m = 0; m < NUM_CLUSTERS; ++m) {
454 for (int o = 0; o < 64; ++o) {
455 if (is_medoid[o]) continue;
456 int old_medoid = medoids[m];
459 int new_assignment[64];
460 double candidate_score = find_best_assignment(medoids, new_assignment);
462 if (candidate_score < current_score) {
463 current_score = candidate_score;
464 memcpy(assignment, new_assignment, sizeof(assignment));
466 is_medoid[old_medoid] = false;
467 is_medoid[medoids[m]] = true;
468 printf("%f: ", current_score);
469 for (int i = 0; i < 64; ++i) {
470 printf("%d ", assignment[i]);
475 medoids[m] = old_medoid;
479 if (!any_changed) break;
482 std::unordered_map<int, int> rmap;
483 for (int i = 0; i < 64; ++i) {
484 if (i % 8 == 0) printf("\n");
485 if (!rmap.count(assignment[i])) {
486 rmap.emplace(assignment[i], rmap.size());
488 printf("%d, ", rmap[assignment[i]]);
494 int main(int argc, char **argv)
497 readpix(rgb, argv[1]);
499 readpix(rgb, "color.pnm");
502 double sum_sq_err = 0.0;
503 //double last_cb_cfl_fac = 0.0;
504 //double last_cr_cfl_fac = 0.0;
506 // DCT and quantize luma
507 for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
508 for (unsigned xb = 0; xb < WIDTH; xb += 8) {
511 for (unsigned y = 0; y < 8; ++y) {
512 for (unsigned x = 0; x < 8; ++x) {
513 in_y[y * 8 + x] = pix_y[(yb + y) * WIDTH + (xb + x)];
520 for (unsigned y = 0; y < 8; ++y) {
521 for (unsigned x = 0; x < 8; ++x) {
522 int coeff_idx = y * 8 + x;
523 int k = quantize(in_y[coeff_idx], coeff_idx);
524 coeff_y[(yb + y) * WIDTH + (xb + x)] = k;
526 // Store back for reconstruction / PSNR calculation
527 in_y[coeff_idx] = unquantize(k, coeff_idx);
533 for (unsigned y = 0; y < 8; ++y) {
534 for (unsigned x = 0; x < 8; ++x) {
535 int k = clamp(in_y[y * 8 + x]);
536 uint8_t *ptr = &pix_y[(yb + y) * WIDTH + (xb + x)];
537 sum_sq_err += (*ptr - k) * (*ptr - k);
543 double mse = sum_sq_err / double(WIDTH * HEIGHT);
544 double psnr_db = 20 * log10(255.0 / sqrt(mse));
545 printf("psnr = %.2f dB\n", psnr_db);
547 //double chroma_energy = 0.0, chroma_energy_pred = 0.0;
549 // DCT and quantize chroma
550 //double last_cb_cfl_fac = 0.0, last_cr_cfl_fac = 0.0;
551 for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
552 for (unsigned xb = 0; xb < WIDTH/2; xb += 8) {
554 // TF switch: Two 8x8 luma blocks -> one 16x8 block, then drop high frequencies
555 printf("in blocks:\n");
556 for (unsigned y = 0; y < 8; ++y) {
557 for (unsigned x = 0; x < 8; ++x) {
558 short a = coeff_y[(yb + y) * WIDTH + (xb*2 + x)];
562 for (unsigned x = 0; x < 8; ++x) {
563 short b = coeff_y[(yb + y) * WIDTH + (xb*2 + x + 8)];
570 for (unsigned y = 0; y < 8; ++y) {
571 for (unsigned x = 0; x < 4; ++x) {
572 short a = coeff_y[(yb + y) * WIDTH + (xb*2 + x)];
573 short b = coeff_y[(yb + y) * WIDTH + (xb*2 + x + 8)];
576 in_y[y * 8 + x * 2 + 0] = a;
577 in_y[y * 8 + x * 2 + 1] = b;
581 printf("tf-ed block:\n");
582 for (unsigned y = 0; y < 8; ++y) {
583 for (unsigned x = 0; x < 8; ++x) {
584 short a = in_y[y * 8 + x];
590 // Read Y block with no tf switch (from reconstructed luma)
592 for (unsigned y = 0; y < 8; ++y) {
593 for (unsigned x = 0; x < 8; ++x) {
594 in_y[y * 8 + x] = pix_y[(yb + y) * (WIDTH) + (xb + x) * 2];
601 short in_cb[64], in_cr[64];
602 for (unsigned y = 0; y < 8; ++y) {
603 for (unsigned x = 0; x < 8; ++x) {
604 in_cb[y * 8 + x] = pix_cb[(yb + y) * (WIDTH/2) + (xb + x)];
605 in_cr[y * 8 + x] = pix_cr[(yb + y) * (WIDTH/2) + (xb + x)];
618 double denom = (x0 * x0 + x1 * x1 + x2 * x2);
619 //double denom = (x1 * x1);
621 double y0 = in_cb[1];
622 double y1 = in_cb[8];
623 double y2 = in_cb[9];
624 double cb_cfl_fac = (x0 * y0 + x1 * y1 + x2 * y2) / denom;
625 //double cb_cfl_fac = (x1 * y1) / denom;
627 for (unsigned y = 0; y < 8; ++y) {
628 for (unsigned x = 0; x < 8; ++x) {
629 short a = in_y[y * 8 + x];
633 for (unsigned x = 0; x < 8; ++x) {
634 short a = in_cb[y * 8 + x];
639 printf("(%d,%d,%d) -> (%d,%d,%d) gives %f\n",
640 in_y[1], in_y[8], in_y[9],
641 in_cb[1], in_cb[8], in_cb[9],
647 double cr_cfl_fac = (x0 * y0 + x1 * y1 + x2 * y2) / denom;
648 //double cr_cfl_fac = (x1 * y1) / denom;
649 printf("cb CfL = %7.3f dc = %5d cr CfL = %7.3f dc = %d\n",
650 cb_cfl_fac, in_cb[0] - in_y[0],
651 cr_cfl_fac, in_cr[0] - in_y[0]);
653 if (denom == 0.0) { cb_cfl_fac = cr_cfl_fac = 0.0; }
656 //last_cb_cfl_fac = cb_cfl_fac;
657 //last_cr_cfl_fac = cr_cfl_fac;
659 for (unsigned coeff_idx = 1; coeff_idx < 64; ++coeff_idx) {
660 //printf("%2d: cb = %3d prediction = %f * %3d = %7.3f\n", coeff_idx, in_cb[coeff_idx], last_cb_cfl_fac, in_y[coeff_idx], last_cb_cfl_fac * in_y[coeff_idx]);
661 //printf("%2d: cr = %3d prediction = %f * %3d = %7.3f\n", coeff_idx, in_cr[coeff_idx], last_cr_cfl_fac, in_y[coeff_idx], last_cr_cfl_fac * in_y[coeff_idx]);
662 double cb_pred = last_cb_cfl_fac * in_y[coeff_idx];
663 chroma_energy += in_cb[coeff_idx] * in_cb[coeff_idx];
664 chroma_energy_pred += (in_cb[coeff_idx] - cb_pred) * (in_cb[coeff_idx] - cb_pred);
666 //in_cb[coeff_idx] -= lrint(last_cb_cfl_fac * in_y[coeff_idx]);
667 //in_cr[coeff_idx] -= lrint(last_cr_cfl_fac * in_y[coeff_idx]);
668 //in_cr[coeff_idx] -= lrint(last_cr_cfl_fac * in_y[coeff_idx]);
669 //in_cb[coeff_idx] = lrint(in_y[coeff_idx] * (1.0 / sqrt(2)));
670 //in_cr[coeff_idx] = lrint(in_y[coeff_idx] * (1.0 / sqrt(2)));
671 //in_cb[coeff_idx] = lrint(in_y[coeff_idx]);
672 //in_cr[coeff_idx] = lrint(in_y[coeff_idx]);
676 //in_cb[0] -= in_y[0];
677 //in_cr[0] -= in_y[0];
680 for (unsigned y = 0; y < 8; ++y) {
681 for (unsigned x = 0; x < 8; ++x) {
682 int coeff_idx = y * 8 + x;
683 int k_cb = quantize(in_cb[coeff_idx], coeff_idx);
684 coeff_cb[(yb + y) * (WIDTH/2) + (xb + x)] = k_cb;
685 int k_cr = quantize(in_cr[coeff_idx], coeff_idx);
686 coeff_cr[(yb + y) * (WIDTH/2) + (xb + x)] = k_cr;
688 // Store back for reconstruction / PSNR calculation
689 in_cb[coeff_idx] = unquantize(k_cb, coeff_idx);
690 in_cr[coeff_idx] = unquantize(k_cr, coeff_idx);
694 idct_int32(in_y); // DEBUG
698 for (unsigned y = 0; y < 8; ++y) {
699 for (unsigned x = 0; x < 8; ++x) {
700 pix_cb[(yb + y) * (WIDTH/2) + (xb + x)] = clamp(in_cb[y * 8 + x]);
701 pix_cr[(yb + y) * (WIDTH/2) + (xb + x)] = clamp(in_cr[y * 8 + x]);
703 // pix_cb[(yb + y) * (WIDTH/2) + (xb + x)] = in_y[y * 8 + x];
704 // pix_cr[(yb + y) * (WIDTH/2) + (xb + x)] = in_y[y * 8 + x];
709 last_cb_cfl_fac = cb_cfl_fac;
710 last_cr_cfl_fac = cr_cfl_fac;
716 printf("chroma_energy = %f, with_pred = %f\n",
717 chroma_energy / (WIDTH * HEIGHT), chroma_energy_pred / (WIDTH * HEIGHT));
720 // DC coefficient pred from the right to left (within each slice)
721 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
724 for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
725 unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
726 unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
727 int k = coeff_y[(yb * 8) * WIDTH + (xb * 8)];
729 coeff_y[(yb * 8) * WIDTH + (xb * 8)] = k - prev_k;
734 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS_CHROMA; block_idx += BLOCKS_PER_STREAM) {
738 for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
739 unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS_CHROMA;
740 unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS_CHROMA;
741 int k_cb = coeff_cb[(yb * 8) * WIDTH/2 + (xb * 8)];
742 int k_cr = coeff_cr[(yb * 8) * WIDTH/2 + (xb * 8)];
744 coeff_cb[(yb * 8) * WIDTH/2 + (xb * 8)] = k_cb - prev_k_cb;
745 coeff_cr[(yb * 8) * WIDTH/2 + (xb * 8)] = k_cr - prev_k_cr;
752 FILE *fp = fopen("reconstructed.pgm", "wb");
753 fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT);
754 fwrite(pix_y, 1, WIDTH * HEIGHT, fp);
757 fp = fopen("reconstructed.pnm", "wb");
758 fprintf(fp, "P6\n%d %d\n255\n", WIDTH, HEIGHT);
759 for (unsigned yb = 0; yb < HEIGHT; ++yb) {
760 for (unsigned xb = 0; xb < WIDTH; ++xb) {
761 int y = pix_y[(yb * WIDTH) + xb];
763 int c0 = yb * (WIDTH/2) + xb/2;
765 cb = pix_cb[c0] - 128.0;
766 cr = pix_cr[c0] - 128.0;
768 int c1 = yb * (WIDTH/2) + std::min<int>(xb/2 + 1, WIDTH/2 - 1);
769 cb = 0.5 * (pix_cb[c0] + pix_cb[c1]) - 128.0;
770 cr = 0.5 * (pix_cr[c0] + pix_cr[c1]) - 128.0;
773 double r = y + 1.5748 * cr;
774 double g = y - 0.1873 * cb - 0.4681 * cr;
775 double b = y + 1.8556 * cb;
777 putc(clamp(lrint(r)), fp);
778 putc(clamp(lrint(g)), fp);
779 putc(clamp(lrint(b)), fp);
784 // For each coefficient, make some tables.
785 size_t extra_bits = 0;
786 for (unsigned i = 0; i < 64; ++i) {
789 for (unsigned y = 0; y < 8; ++y) {
790 for (unsigned x = 0; x < 8; ++x) {
791 SymbolStats &s_luma = stats[pick_stats_for(x, y, false)];
792 SymbolStats &s_chroma = stats[pick_stats_for(x, y, true)];
795 for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
796 for (unsigned xb = 0; xb < WIDTH; xb += 8) {
797 unsigned short k = abs(coeff_y[(yb + y) * WIDTH + (xb + x)]);
798 if (k >= ESCAPE_LIMIT) {
800 extra_bits += 12; // escape this one
802 ++s_luma.freqs[(k - 1) & (NUM_SYMS - 1)];
806 for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
807 for (unsigned xb = 0; xb < WIDTH/2; xb += 8) {
808 unsigned short k_cb = abs(coeff_cb[(yb + y) * WIDTH/2 + (xb + x)]);
809 unsigned short k_cr = abs(coeff_cr[(yb + y) * WIDTH/2 + (xb + x)]);
810 if (k_cb >= ESCAPE_LIMIT) {
812 extra_bits += 12; // escape this one
814 if (k_cr >= ESCAPE_LIMIT) {
816 extra_bits += 12; // escape this one
818 ++s_chroma.freqs[(k_cb - 1) & (NUM_SYMS - 1)];
819 ++s_chroma.freqs[(k_cr - 1) & (NUM_SYMS - 1)];
825 #if FIND_OPTIMAL_STREAM_ASSIGNMENT
827 find_optimal_stream_assignment(0);
829 find_optimal_stream_assignment(64);
833 for (unsigned i = 0; i < 64; ++i) {
834 stats[i].freqs[NUM_SYMS - 1] /= 2; // zero, has no sign bits (yes, this is trickery)
835 stats[i].normalize_freqs(prob_scale);
836 stats[i].cum_freqs[NUM_SYMS] += stats[i].freqs[NUM_SYMS - 1];
837 stats[i].freqs[NUM_SYMS - 1] *= 2;
840 FILE *codedfp = fopen("coded.dat", "wb");
841 if (codedfp == nullptr) {
846 // TODO: rather gamma-k or something
847 for (unsigned i = 0; i < 64; ++i) {
848 if (stats[i].cum_freqs[NUM_SYMS] == 0) {
851 printf("writing table %d\n", i);
852 for (unsigned j = 0; j < NUM_SYMS; ++j) {
853 write_varint(stats[i].freqs[j], codedfp);
857 RansEncoder rans_encoder;
859 size_t tot_bytes = 0;
862 for (unsigned y = 0; y < 8; ++y) {
863 for (unsigned x = 0; x < 8; ++x) {
864 SymbolStats &s_luma = stats[pick_stats_for(x, y, false)];
865 rans_encoder.init_prob(s_luma);
868 std::vector<int> lens;
870 // need to reverse later
871 rans_encoder.clear();
872 size_t num_bytes = 0;
873 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
874 unsigned yb = block_idx / WIDTH_BLOCKS;
875 unsigned xb = block_idx % WIDTH_BLOCKS;
877 int k = coeff_y[(yb * 8 + y) * WIDTH + (xb * 8 + x)];
878 //printf("encoding coeff %d xb,yb=%d,%d: %d\n", y*8+x, xb, yb, k);
879 rans_encoder.encode_coeff(k);
881 if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
882 int l = rans_encoder.save_block(codedfp);
887 tot_bytes += num_bytes;
888 printf("coeff %d Y': %ld bytes\n", y * 8 + x, num_bytes);
894 double avg_l = sum_l / lens.size();
896 double sum_sql = 0.0;
898 sum_sql += (l - avg_l) * (l - avg_l);
900 double stddev_l = sqrt(sum_sql / (lens.size() - 1));
901 printf("coeff %d: avg=%.2f bytes, stddev=%.2f bytes\n", y*8+x, avg_l, stddev_l);
906 for (unsigned y = 0; y < 8; ++y) {
907 for (unsigned x = 0; x < 8; ++x) {
908 SymbolStats &s_chroma = stats[pick_stats_for(x, y, true)];
909 rans_encoder.init_prob(s_chroma);
911 rans_encoder.clear();
912 size_t num_bytes = 0;
913 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS_CHROMA; ++block_idx) {
914 unsigned yb = block_idx / WIDTH_BLOCKS_CHROMA;
915 unsigned xb = block_idx % WIDTH_BLOCKS_CHROMA;
917 int k = coeff_cb[(yb * 8 + y) * WIDTH/2 + (xb * 8 + x)];
918 //printf("encoding coeff %d xb,yb=%d,%d: %d\n", y*8+x, xb, yb, k);
919 rans_encoder.encode_coeff(k);
921 if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
922 num_bytes += rans_encoder.save_block(codedfp);
925 tot_bytes += num_bytes;
926 printf("coeff %d Cb: %ld bytes\n", y * 8 + x, num_bytes);
931 for (unsigned y = 0; y < 8; ++y) {
932 for (unsigned x = 0; x < 8; ++x) {
933 SymbolStats &s_chroma = stats[pick_stats_for(x, y, true)];
934 rans_encoder.init_prob(s_chroma);
936 rans_encoder.clear();
937 size_t num_bytes = 0;
938 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS_CHROMA; ++block_idx) {
939 unsigned yb = block_idx / WIDTH_BLOCKS_CHROMA;
940 unsigned xb = block_idx % WIDTH_BLOCKS_CHROMA;
942 int k = coeff_cr[(yb * 8 + y) * WIDTH/2 + (xb * 8 + x)];
943 //printf("encoding coeff %d xb,yb=%d,%d: %d\n", y*8+x, xb, yb, k);
944 rans_encoder.encode_coeff(k);
946 if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
947 num_bytes += rans_encoder.save_block(codedfp);
950 tot_bytes += num_bytes;
951 printf("coeff %d Cr: %ld bytes\n", y * 8 + x, num_bytes);
955 printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
956 tot_bytes - extra_bits / 8,