+ for (unsigned xb = 0; xb < WIDTH/2; xb += 8) {
+#if 0
+ // TF switch: Two 8x8 luma blocks -> one 16x8 block, then drop high frequencies
+ printf("in blocks:\n");
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 8; ++x) {
+ short a = coeff_y[(yb + y) * WIDTH + (xb*2 + x)];
+ printf(" %4d", a);
+ }
+ printf(" | ");
+ for (unsigned x = 0; x < 8; ++x) {
+ short b = coeff_y[(yb + y) * WIDTH + (xb*2 + x + 8)];
+ printf(" %4d", b);
+ }
+ printf("\n");
+ }
+
+ short in_y[64];
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 4; ++x) {
+ short a = coeff_y[(yb + y) * WIDTH + (xb*2 + x)];
+ short b = coeff_y[(yb + y) * WIDTH + (xb*2 + x + 8)];
+ b = a - b;
+ a = 2 * a - b;
+ in_y[y * 8 + x * 2 + 0] = a;
+ in_y[y * 8 + x * 2 + 1] = b;
+ }
+ }
+
+ printf("tf-ed block:\n");
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 8; ++x) {
+ short a = in_y[y * 8 + x];
+ printf(" %4d", a);
+ }
+ printf("\n");
+ }
+#else
+ // Read Y block with no tf switch (from reconstructed luma)
+ short in_y[64];
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 8; ++x) {
+ in_y[y * 8 + x] = pix_y[(yb + y) * (WIDTH) + (xb + x) * 2];
+ }
+ }
+ fdct_int32(in_y);
+#endif
+
+ // Read one block
+ short in_cb[64], in_cr[64];
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 8; ++x) {
+ in_cb[y * 8 + x] = pix_cb[(yb + y) * (WIDTH/2) + (xb + x)];
+ in_cr[y * 8 + x] = pix_cr[(yb + y) * (WIDTH/2) + (xb + x)];
+ }
+ }
+
+ // FDCT it
+ fdct_int32(in_cb);
+ fdct_int32(in_cr);
+
+#if 0
+ // Chroma from luma
+ double x0 = in_y[1];
+ double x1 = in_y[8];
+ double x2 = in_y[9];
+ double denom = (x0 * x0 + x1 * x1 + x2 * x2);
+ //double denom = (x1 * x1);
+
+ double y0 = in_cb[1];
+ double y1 = in_cb[8];
+ double y2 = in_cb[9];
+ double cb_cfl_fac = (x0 * y0 + x1 * y1 + x2 * y2) / denom;
+ //double cb_cfl_fac = (x1 * y1) / denom;
+
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 8; ++x) {
+ short a = in_y[y * 8 + x];
+ printf(" %4d", a);
+ }
+ printf(" | ");
+ for (unsigned x = 0; x < 8; ++x) {
+ short a = in_cb[y * 8 + x];
+ printf(" %4d", a);
+ }
+ printf("\n");
+ }
+ printf("(%d,%d,%d) -> (%d,%d,%d) gives %f\n",
+ in_y[1], in_y[8], in_y[9],
+ in_cb[1], in_cb[8], in_cb[9],
+ cb_cfl_fac);
+
+ y0 = in_cr[1];
+ y1 = in_cr[8];
+ y2 = in_cr[9];
+ double cr_cfl_fac = (x0 * y0 + x1 * y1 + x2 * y2) / denom;
+ //double cr_cfl_fac = (x1 * y1) / denom;
+ printf("cb CfL = %7.3f dc = %5d cr CfL = %7.3f dc = %d\n",
+ cb_cfl_fac, in_cb[0] - in_y[0],
+ cr_cfl_fac, in_cr[0] - in_y[0]);
+
+ if (denom == 0.0) { cb_cfl_fac = cr_cfl_fac = 0.0; }
+
+ // CHEAT
+ //last_cb_cfl_fac = cb_cfl_fac;
+ //last_cr_cfl_fac = cr_cfl_fac;
+
+ for (unsigned coeff_idx = 1; coeff_idx < 64; ++coeff_idx) {
+ //printf("%2d: cb = %3d prediction = %f * %3d = %7.3f\n", coeff_idx, in_cb[coeff_idx], last_cb_cfl_fac, in_y[coeff_idx], last_cb_cfl_fac * in_y[coeff_idx]);
+ //printf("%2d: cr = %3d prediction = %f * %3d = %7.3f\n", coeff_idx, in_cr[coeff_idx], last_cr_cfl_fac, in_y[coeff_idx], last_cr_cfl_fac * in_y[coeff_idx]);
+ double cb_pred = last_cb_cfl_fac * in_y[coeff_idx];
+ chroma_energy += in_cb[coeff_idx] * in_cb[coeff_idx];
+ chroma_energy_pred += (in_cb[coeff_idx] - cb_pred) * (in_cb[coeff_idx] - cb_pred);
+
+ //in_cb[coeff_idx] -= lrint(last_cb_cfl_fac * in_y[coeff_idx]);
+ //in_cr[coeff_idx] -= lrint(last_cr_cfl_fac * in_y[coeff_idx]);
+ //in_cr[coeff_idx] -= lrint(last_cr_cfl_fac * in_y[coeff_idx]);
+ //in_cb[coeff_idx] = lrint(in_y[coeff_idx] * (1.0 / sqrt(2)));
+ //in_cr[coeff_idx] = lrint(in_y[coeff_idx] * (1.0 / sqrt(2)));
+ //in_cb[coeff_idx] = lrint(in_y[coeff_idx]);
+ //in_cr[coeff_idx] = lrint(in_y[coeff_idx]);
+ }
+ //in_cb[0] += 1024;
+ //in_cr[0] += 1024;
+ //in_cb[0] -= in_y[0];
+ //in_cr[0] -= in_y[0];
+#endif
+
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 8; ++x) {
+ int coeff_idx = y * 8 + x;
+ int k_cb = quantize(in_cb[coeff_idx], coeff_idx);
+ coeff_cb[(yb + y) * (WIDTH/2) + (xb + x)] = k_cb;
+ int k_cr = quantize(in_cr[coeff_idx], coeff_idx);
+ coeff_cr[(yb + y) * (WIDTH/2) + (xb + x)] = k_cr;
+
+ // Store back for reconstruction / PSNR calculation
+ in_cb[coeff_idx] = unquantize(k_cb, coeff_idx);
+ in_cr[coeff_idx] = unquantize(k_cr, coeff_idx);
+ }
+ }
+
+ idct_int32(in_y); // DEBUG
+ idct_int32(in_cb);
+ idct_int32(in_cr);
+
+ for (unsigned y = 0; y < 8; ++y) {
+ for (unsigned x = 0; x < 8; ++x) {
+ pix_cb[(yb + y) * (WIDTH/2) + (xb + x)] = clamp(in_cb[y * 8 + x]);
+ pix_cr[(yb + y) * (WIDTH/2) + (xb + x)] = clamp(in_cr[y * 8 + x]);
+
+ // pix_cb[(yb + y) * (WIDTH/2) + (xb + x)] = in_y[y * 8 + x];
+ // pix_cr[(yb + y) * (WIDTH/2) + (xb + x)] = in_y[y * 8 + x];
+ }
+ }
+
+#if 0
+ last_cb_cfl_fac = cb_cfl_fac;
+ last_cr_cfl_fac = cr_cfl_fac;
+#endif
+ }
+ }
+
+#if 0
+ printf("chroma_energy = %f, with_pred = %f\n",
+ chroma_energy / (WIDTH * HEIGHT), chroma_energy_pred / (WIDTH * HEIGHT));
+#endif
+
+ // DC coefficient pred from the right to left (within each slice)
+ for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
+ int prev_k = 128;
+
+ for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
+ unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
+ unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
+ int k = coeff_y[(yb * 8) * WIDTH + (xb * 8)];
+
+ coeff_y[(yb * 8) * WIDTH + (xb * 8)] = k - prev_k;
+
+ prev_k = k;