+/* Shared opening of the 2x4 DC routines.
+ * Expects an 8-element `dct` to be in scope at the expansion site.
+ * Stage 1 forms the sum (sN) and difference (dN) of each adjacent pair
+ * dct[2N], dct[2N+1]; stage 2 combines those into b0..b7, which are the only
+ * names the expanding functions go on to use. */
+#define IDCT_DEQUANT_2X4_START \
+    int s0 = dct[0] + dct[1], d0 = dct[0] - dct[1]; \
+    int s1 = dct[2] + dct[3], d1 = dct[2] - dct[3]; \
+    int s2 = dct[4] + dct[5], d2 = dct[4] - dct[5]; \
+    int s3 = dct[6] + dct[7], d3 = dct[6] - dct[7]; \
+    int b0 = s0 + s1, b4 = s0 - s1; \
+    int b1 = s2 + s3, b5 = s2 - s3; \
+    int b2 = d0 + d1, b6 = d0 - d1; \
+    int b3 = d2 + d3, b7 = d2 - d3;
+
+/* Transforms the eight values in dct via IDCT_DEQUANT_2X4_START, scales each
+ * result by the dequant factor for i_qp, rounds (+32, >>6) and stores result k
+ * in dct4x4[k][0].  dct itself is only read. */
+static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
+{
+    IDCT_DEQUANT_2X4_START
+    /* Entry 0 of the table row selected by i_qp%6, shifted left by i_qp/6. */
+    const int scale = dequant_mf[i_qp % 6][0] << (i_qp / 6);
+    /* Final butterfly stage, listed in output order. */
+    const int stage3[8] =
+    {
+        b0 + b1, b2 + b3, b0 - b1, b2 - b3,
+        b4 - b5, b6 - b7, b4 + b5, b6 + b7
+    };
+
+    for( int k = 0; k < 8; k++ )
+        dct4x4[k][0] = (stage3[k] * scale + 32) >> 6;
+}
+
+/* In-place variant: same transform, scaling and rounding as the dct4x4 form,
+ * but the eight results overwrite dct[0..7].  All inputs are read by
+ * IDCT_DEQUANT_2X4_START before the first store, so the overwrite is safe. */
+static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
+{
+    IDCT_DEQUANT_2X4_START
+    /* Entry 0 of the table row selected by i_qp%6, shifted left by i_qp/6. */
+    const int scale = dequant_mf[i_qp % 6][0] << (i_qp / 6);
+    /* Final butterfly stage, listed in output order. */
+    const int stage3[8] =
+    {
+        b0 + b1, b2 + b3, b0 - b1, b2 - b3,
+        b4 - b5, b6 - b7, b4 + b5, b6 + b7
+    };
+
+    for( int k = 0; k < 8; k++ )
+        dct[k] = (stage3[k] * scale + 32) >> 6;
+}
+
+/* 2x4 transform + scale used by the DC optimizer.  Takes the already-combined
+ * scale factor dmf and writes the eight results to out; dct is only read. */
+static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
+{
+    IDCT_DEQUANT_2X4_START
+    /* 2080 = 32 + (32<<6) */
+    const int bias = 32 + (32 << 6);
+    /* Final butterfly stage, listed in output order. */
+    const int stage3[8] =
+    {
+        b0 + b1, b2 + b3, b0 - b1, b2 - b3,
+        b4 - b5, b6 - b7, b4 + b5, b6 + b7
+    };
+
+    for( int k = 0; k < 8; k++ )
+        out[k] = (stage3[k] * dmf + bias) >> 6;
+}
+#undef IDCT_DEQUANT_2X4_START
+
+/* 2x2 transform + scale used by the DC optimizer: each butterfly result is
+ * multiplied by dmf, shifted right by 5, then offset by 32.  dct is only read. */
+static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
+{
+    const int sum_lo  = dct[0] + dct[1];
+    const int sum_hi  = dct[2] + dct[3];
+    const int diff_lo = dct[0] - dct[1];
+    const int diff_hi = dct[2] - dct[3];
+    /* Second butterfly stage, listed in output order. */
+    const int stage2[4] =
+    {
+        sum_lo + sum_hi, sum_lo - sum_hi,
+        diff_lo + diff_hi, diff_lo - diff_hi
+    };
+
+    for( int k = 0; k < 4; k++ )
+        out[k] = ((stage2[k] * dmf) >> 5) + 32;
+}
+
+/* Runs the 2x4 (chroma422 != 0) or 2x2 transform on dct and compares the
+ * result with ref.  Returns nonzero when any output differs from its ref
+ * entry in bit 6 or above; differences confined to the low 6 bits are ignored. */
+static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
+{
+    dctcoef out[8];
+    const int count = chroma422 ? 8 : 4;
+
+    if( !chroma422 )
+        optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );
+    else
+        optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
+
+    /* OR together every differing bit across all outputs. */
+    int diff_bits = 0;
+    for( int k = 0; k < count; k++ )
+        diff_bits |= ref[k] ^ out[k];
+
+    return diff_bits >> 6;
+}
+
+/* Shrinks the magnitudes of the DC coefficients in dct (in place) as far as
+ * possible without optimize_chroma_round() reporting a change against the
+ * transform of the original coefficients.
+ * Returns 0 if the original outputs have no bits at position 6 or above (dct
+ * is left untouched) or if every coefficient could be stepped down to zero;
+ * returns 1 if at least one coefficient had to be kept nonzero. */
+static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
+{
+    /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
+    const int count = chroma422 ? 8 : 4;
+    dctcoef dct_orig[8];
+
+    /* Reference output computed from the unmodified coefficients. */
+    if( !chroma422 )
+        optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );
+    else
+        optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
+
+    /* If the DC coefficients already round to zero, terminate early. */
+    int any_bits = 0;
+    for( int k = 0; k < count; k++ )
+        any_bits |= dct_orig[k];
+    if( (any_bits >> 6) == 0 )
+        return 0;
+
+    /* Start with the highest frequency coefficient... is this the best option? */
+    int nz = 0;
+    for( int idx = count - 1; idx >= 0; idx-- )
+    {
+        int level = dct[idx];
+        /* dct[idx] < 0 ? -1 : 1 */
+        const int step = (level >> 31) | 1;
+
+        /* Move one unit toward zero per pass until the output would change. */
+        for( ; level != 0; level -= step )
+        {
+            dct[idx] = level - step;
+            if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
+            {
+                /* Too far: restore the last value that kept the output intact. */
+                dct[idx] = level;
+                nz = 1;
+                break;
+            }
+        }
+    }
+
+    return nz;
+}
+
+/* Four-coefficient (2x2) entry point: forwards to the shared implementation
+ * with chroma422 = 0 and returns its result unchanged. */
+static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
+{
+    const int chroma422 = 0;
+    return optimize_chroma_dc_internal( dct, dequant_mf, chroma422 );
+}
+
+/* Eight-coefficient (2x4) entry point: forwards to the shared implementation
+ * with chroma422 = 1 and returns its result unchanged. */
+static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf )
+{
+    const int chroma422 = 1;
+    return optimize_chroma_dc_internal( dct, dequant_mf, chroma422 );
+}
+