+/************************************************************************
+ * VP3 I/DCT
+ ************************************************************************/
+
+#define IdctAdjustBeforeShift 8 /* rounding bias, half of (1 << 4); added in the column pass ahead of its final >> 4 */
+#define xC1S7 64277 /* round(65536 * cos(1 * pi / 16)); products with these factors are shifted right by 16 */
+#define xC2S6 60547 /* round(65536 * cos(2 * pi / 16)) */
+#define xC3S5 54491 /* round(65536 * cos(3 * pi / 16)) */
+#define xC4S4 46341 /* round(65536 * cos(4 * pi / 16)) */
+#define xC5S3 36410 /* round(65536 * cos(5 * pi / 16)) */
+#define xC6S2 25080 /* round(65536 * cos(6 * pi / 16)) */
+#define xC7S1 12785 /* round(65536 * cos(7 * pi / 16)) */
+
+/* Multiply a 16.16 fixed-point factor by a value and drop the 16 fractional bits. */
+static inline int32_t vp3_mul_q16(int32_t factor, int32_t value)
+{
+    return ((int32_t)(factor * value)) >> 16;
+}
+
+/*
+ * One 8-point inverse-DCT pass over src[0], src[stride], ..., src[7 * stride].
+ *
+ * `bias` ends up in every output sum exactly once, each sum is then shifted
+ * right by `shift`, and the result is truncated to 16 bits into dst[0..7].
+ * All inputs are read before anything is written to dst.
+ */
+static inline void vp3_idct_1d(const int32_t *src, int stride,
+                               int32_t bias, int shift, int16_t dst[8])
+{
+    const int32_t s0 = src[0 * stride];
+    const int32_t s1 = src[1 * stride];
+    const int32_t s2 = src[2 * stride];
+    const int32_t s3 = src[3 * stride];
+    const int32_t s4 = src[4 * stride];
+    const int32_t s5 = src[5 * stride];
+    const int32_t s6 = src[6 * stride];
+    const int32_t s7 = src[7 * stride];
+
+    /* rotations of the odd-indexed inputs */
+    const int32_t a = vp3_mul_q16(xC1S7, s1) + vp3_mul_q16(xC7S1, s7);
+    const int32_t b = vp3_mul_q16(xC7S1, s1) - vp3_mul_q16(xC1S7, s7);
+    const int32_t c = vp3_mul_q16(xC3S5, s3) + vp3_mul_q16(xC5S3, s5);
+    const int32_t d = vp3_mul_q16(xC3S5, s5) - vp3_mul_q16(xC5S3, s3);
+
+    const int32_t ad = vp3_mul_q16(xC4S4, a - c);
+    const int32_t bd = vp3_mul_q16(xC4S4, b - d);
+    const int32_t cd = a + c;
+    const int32_t dd = b + d;
+
+    /* even-indexed inputs */
+    const int32_t e = vp3_mul_q16(xC4S4, s0 + s4);
+    const int32_t f = vp3_mul_q16(xC4S4, s0 - s4);
+    const int32_t g = vp3_mul_q16(xC2S6, s2) + vp3_mul_q16(xC6S2, s6);
+    const int32_t h = vp3_mul_q16(xC6S2, s2) - vp3_mul_q16(xC2S6, s6);
+
+    /* the four biased terms each feed one +/- pair of outputs */
+    const int32_t gd  = (e + g) + bias;
+    const int32_t ed  = (e - g) + bias;
+    const int32_t add = (f + ad) + bias;
+    const int32_t fd  = (f - ad) + bias;
+    const int32_t bdd = bd - h;
+    const int32_t hd  = bd + h;
+
+    dst[0] = (int16_t)((gd + cd) >> shift);
+    dst[7] = (int16_t)((gd - cd) >> shift);
+
+    dst[1] = (int16_t)((add + hd) >> shift);
+    dst[2] = (int16_t)((add - hd) >> shift);
+
+    dst[3] = (int16_t)((ed + dd) >> shift);
+    dst[4] = (int16_t)((ed - dd) >> shift);
+
+    dst[5] = (int16_t)((fd + bdd) >> shift);
+    dst[6] = (int16_t)((fd - bdd) >> shift);
+}
+
+/*
+ * Dequantize and inverse-transform one 8x8 block.
+ *
+ * input_data     64 coefficients, read in the order indexed by dezigzag_index
+ * dequant_matrix 64 multipliers, one per coefficient, in that same order
+ * output_data    receives 64 results, laid out as 8 rows of 8
+ *
+ * The rows are transformed first, in place in a 32-bit scratch block with each
+ * result truncated to 16 bits; the columns follow, with IdctAdjustBeforeShift
+ * added and a final right shift by 4.
+ */
+void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix,
+    int16_t *output_data)
+{
+    int32_t coeffs[64];
+    int16_t line[8];
+    int row, col, k;
+
+    debug_idct("raw coefficient block:\n");
+    for (row = 0; row < 8; row++) {
+        for (col = 0; col < 8; col++) {
+            debug_idct(" %5d", input_data[row * 8 + col]);
+        }
+        debug_idct("\n");
+    }
+    debug_idct("\n");
+
+    /* dequantize; dezigzag_index[k] names the slot coefficient k lands in */
+    for (k = 0; k < 64; k++) {
+        coeffs[dezigzag_index[k]] = dequant_matrix[k] * input_data[k];
+    }
+
+    debug_idct("dequantized block:\n");
+    for (row = 0; row < 8; row++) {
+        for (col = 0; col < 8; col++) {
+            debug_idct(" %5d", coeffs[row * 8 + col]);
+        }
+        debug_idct("\n");
+    }
+    debug_idct("\n");
+
+    /* row pass: no bias, no shift, results overwrite the scratch row */
+    for (row = 0; row < 8; row++) {
+        int32_t *cur = coeffs + row * 8;
+
+        /* an all-zero row is left untouched */
+        if ((cur[0] | cur[1] | cur[2] | cur[3] |
+             cur[4] | cur[5] | cur[6] | cur[7]) == 0) {
+            continue;
+        }
+
+        vp3_idct_1d(cur, 1, 0, 0, line);
+        for (k = 0; k < 8; k++) {
+            cur[k] = line[k];
+        }
+    }
+
+    /* column pass: biased, shifted by 4, written to the caller's block */
+    for (col = 0; col < 8; col++) {
+        const int32_t *cur = coeffs + col;
+        int16_t *out = output_data + col;
+
+        if (cur[0 * 8] | cur[1 * 8] | cur[2 * 8] | cur[3 * 8] |
+            cur[4 * 8] | cur[5 * 8] | cur[6 * 8] | cur[7 * 8]) {
+            vp3_idct_1d(cur, 8, IdctAdjustBeforeShift, 4, line);
+            for (k = 0; k < 8; k++) {
+                out[k * 8] = line[k];
+            }
+        } else {
+            /* an all-zero column yields an all-zero output column */
+            for (k = 0; k < 8; k++) {
+                out[k * 8] = 0;
+            }
+        }
+    }
+}
+
+/*
+ * Inverse-transform one block with vp3_idct_c() and store it as pixels.
+ *
+ * Each transformed sample is offset by +128 and saturated to 0..255, then
+ * written to an 8x8 area starting at dest, whose rows are `stride` bytes apart.
+ */
+void vp3_idct_put(int16_t *input_data, int16_t *dequant_matrix,
+    uint8_t *dest, int stride)
+{
+    int16_t block[64];
+    uint8_t *line = dest;
+    int x, y;
+
+    vp3_idct_c(input_data, dequant_matrix, block);
+
+    for (y = 0; y < 8; y++, line += stride) {
+        for (x = 0; x < 8; x++) {
+            const int v = block[y * 8 + x];
+
+            /* saturate before adding the +128 offset */
+            line[x] = v < -128 ? 0
+                    : v > 127  ? 255
+                    : (uint8_t)(v + 128);
+        }
+    }
+}
+
+/*
+ * Inverse-transform one block with vp3_idct_c() and add it to existing pixels.
+ *
+ * Each transformed sample is added to the byte already at the matching
+ * position of the 8x8 area starting at dest (rows `stride` bytes apart), and
+ * the sum is saturated to 0..255 before being written back.
+ */
+void vp3_idct_add(int16_t *input_data, int16_t *dequant_matrix,
+    uint8_t *dest, int stride)
+{
+    int16_t residual[64];
+    int x, y;
+
+    vp3_idct_c(input_data, dequant_matrix, residual);
+
+    for (y = 0; y < 8; y++) {
+        for (x = 0; x < 8; x++) {
+            /*
+             * Keep the sum in an int: pixel + residual can reach
+             * 255 + 32767, which does not fit in int16_t and would be
+             * converted (typically wrapped negative) before the clamp.
+             */
+            const int sum = dest[x] + residual[y * 8 + x];
+
+            if (sum < 0)
+                dest[x] = 0;
+            else if (sum > 255)
+                dest[x] = 255;
+            else
+                dest[x] = (uint8_t)sum;
+        }
+        dest += stride;
+    }
+}
+