Merges Guillaume Poirier's AltiVec changes:

[x264] / common / quant.c
diff --git a/common/quant.c b/common/quant.c

index 437a135df5b6f60fd00ef561aa82f2545939c6d2..e7bd48ccdb24844201f4ef19bd5b8239ca00c8c6 100644 (file)
--- a/common/quant.c
+++ b/common/quant.c
@@ -22,13 +22,12 @@
  
  #include "common.h"
  
-void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f );
-void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
-void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f );
-void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
-void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f );
-void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f );
-
+#ifdef HAVE_MMXEXT
+#include "i386/quant.h"
+#endif
+#ifdef ARCH_PPC
+#   include "ppc/quant.h"
+#endif
  
  #define QUANT_ONE( coef, mf ) \
  { \
@@ -67,45 +66,231 @@ static void quant_2x2_dc_core( int16_t dct[2][2], int i_quant_mf, int i_qbits, i
      QUANT_ONE( dct[0][3], i_quant_mf );
  }
  
+#define DEQUANT_SHL( x ) /* entry [y][x]: multiply by the matching table entry, then shift left by i_qbits */ \
+    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] ) << i_qbits   /* dct, dequant_mf, i_mf, y, i_qbits are taken from the expanding function */
+
+#define DEQUANT_SHR( x ) /* entry [y][x]: multiply, add the offset f, then shift right by -i_qbits */ \
+    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] + f ) >> (-i_qbits)   /* additionally needs f in the expanding scope */
+
+static void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+{   /* Rescales all 16 entries of dct in place with table dequant_mf[i_qp%6] and a shift derived from i_qp/6. */
+    const int i_mf = i_qp%6;   /* index of the table used, one of the 6 in dequant_mf */
+    const int i_qbits = i_qp/6 - 4;   /* >= 0: shift left by i_qbits; < 0: shift right by -i_qbits */
+    int y;   /* row index, read implicitly by the DEQUANT_* macros */
+
+    if( i_qbits >= 0 )
+    {   /* left-shift path, one row per iteration with the 4 columns unrolled */
+        for( y = 0; y < 4; y++ )
+        {
+            DEQUANT_SHL( 0 );
+            DEQUANT_SHL( 1 );
+            DEQUANT_SHL( 2 );
+            DEQUANT_SHL( 3 );
+        }
+    }
+    else
+    {   /* right-shift path */
+        const int f = 1 << (-i_qbits-1);   /* half of 1 << (-i_qbits); DEQUANT_SHR adds it before shifting */
+        for( y = 0; y < 4; y++ )
+        {
+            DEQUANT_SHR( 0 );
+            DEQUANT_SHR( 1 );
+            DEQUANT_SHR( 2 );
+            DEQUANT_SHR( 3 );
+        }
+    }
+}
+
+static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
+{   /* Rescales all 64 entries of dct in place with table dequant_mf[i_qp%6] and a shift derived from i_qp/6. */
+    const int i_mf = i_qp%6;   /* index of the table used, one of the 6 in dequant_mf */
+    const int i_qbits = i_qp/6 - 6;   /* >= 0: shift left by i_qbits; < 0: shift right by -i_qbits */
+    int y;   /* row index, read implicitly by the DEQUANT_* macros */
+
+    if( i_qbits >= 0 )
+    {   /* left-shift path, one row per iteration with the 8 columns unrolled */
+        for( y = 0; y < 8; y++ )
+        {
+            DEQUANT_SHL( 0 );
+            DEQUANT_SHL( 1 );
+            DEQUANT_SHL( 2 );
+            DEQUANT_SHL( 3 );
+            DEQUANT_SHL( 4 );
+            DEQUANT_SHL( 5 );
+            DEQUANT_SHL( 6 );
+            DEQUANT_SHL( 7 );
+        }
+    }
+    else
+    {   /* right-shift path */
+        const int f = 1 << (-i_qbits-1);   /* half of 1 << (-i_qbits); DEQUANT_SHR adds it before shifting */
+        for( y = 0; y < 8; y++ )
+        {
+            DEQUANT_SHR( 0 );
+            DEQUANT_SHR( 1 );
+            DEQUANT_SHR( 2 );
+            DEQUANT_SHR( 3 );
+            DEQUANT_SHR( 4 );
+            DEQUANT_SHR( 5 );
+            DEQUANT_SHR( 6 );
+            DEQUANT_SHR( 7 );
+        }
+    }
+}
+
+void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+{   /* Rescales the four entries of dct in place; only element [0][0] of table dequant_mf[i_qp%6] is used. */
+    const int i_qbits = i_qp/6 - 5;   /* >= 0: scale up by 1 << i_qbits; < 0: shift right by -i_qbits */
+
+    if( i_qbits >= 0 )
+    {
+        const int i_dmf = dequant_mf[i_qp%6][0][0] << i_qbits;   /* the left shift is folded into the multiplier once */
+        dct[0][0] *= i_dmf;
+        dct[0][1] *= i_dmf;
+        dct[1][0] *= i_dmf;
+        dct[1][1] *= i_dmf;
+    }
+    else
+    {
+        const int i_dmf = dequant_mf[i_qp%6][0][0];   /* unshifted multiplier; the shift is applied per entry below */
+        // chroma DC is truncated, not rounded
+        dct[0][0] = ( dct[0][0] * i_dmf ) >> (-i_qbits);   /* no offset is added before the shift, unlike x264_mb_dequant_4x4_dc */
+        dct[0][1] = ( dct[0][1] * i_dmf ) >> (-i_qbits);
+        dct[1][0] = ( dct[1][0] * i_dmf ) >> (-i_qbits);
+        dct[1][1] = ( dct[1][1] * i_dmf ) >> (-i_qbits);
+    }
+}
+
+void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+{   /* Rescales all 16 entries of dct in place; only element [0][0] of table dequant_mf[i_qp%6] is used. */
+    const int i_qbits = i_qp/6 - 6;   /* >= 0: scale up by 1 << i_qbits; < 0: shift right by -i_qbits */
+    int y;   /* row index */
+
+    if( i_qbits >= 0 )
+    {
+        const int i_dmf = dequant_mf[i_qp%6][0][0] << i_qbits;   /* the left shift is folded into the multiplier once */
+
+        for( y = 0; y < 4; y++ )
+        {
+            dct[y][0] *= i_dmf;
+            dct[y][1] *= i_dmf;
+            dct[y][2] *= i_dmf;
+            dct[y][3] *= i_dmf;
+        }
+    }
+    else
+    {
+        const int i_dmf = dequant_mf[i_qp%6][0][0];   /* unshifted multiplier; the shift is applied per entry below */
+        const int f = 1 << (-i_qbits-1);   /* half of 1 << (-i_qbits), added before the right shift */
+
+        for( y = 0; y < 4; y++ )
+        {
+            dct[y][0] = ( dct[y][0] * i_dmf + f ) >> (-i_qbits);
+            dct[y][1] = ( dct[y][1] * i_dmf + f ) >> (-i_qbits);
+            dct[y][2] = ( dct[y][2] * i_dmf + f ) >> (-i_qbits);
+            dct[y][3] = ( dct[y][3] * i_dmf + f ) >> (-i_qbits);
+        }
+    }
+}
  
  void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
  {
-    const char *name[4] = { "C", "C", "C", "C" };
+    int i, j, maxQ8=0, maxQ4=0, maxQdc=0;   /* table maxima, filled in by the two scans below and used to pick kernels */
  
      pf->quant_8x8_core = quant_8x8_core;
      pf->quant_4x4_core = quant_4x4_core;
      pf->quant_4x4_dc_core = quant_4x4_dc_core;
      pf->quant_2x2_dc_core = quant_2x2_dc_core;
  
+    pf->dequant_4x4 = dequant_4x4;   /* C dequant defaults; replaced in the X264_CPU_MMX branch below */
+    pf->dequant_8x8 = dequant_8x8;
+
+    /* determine the biggest coefficient in all quant8_mf tables */
+    for( j = 0; j < 2; j++ )
+        for( i = 0; i < 6*8*8; i++ )   /* flat walk over all 6*8*8 entries of table j through the last index */
+        {
+            int q = h->quant8_mf[j][0][0][i];   /* NOTE(review): relies on table j being stored contiguously - confirm against the x264_t declaration */
+            if( maxQ8 < q )
+                maxQ8 = q;
+        }
+
+    /* determine the biggest coefficient in all quant4_mf tables ( maxQ4 )
+       and the biggest DC coefficient of all quant4_mf tables ( maxQdc ) */
+    for( j = 0; j < 4; j++ )
+        for( i = 0; i < 6*4*4; i++ )   /* same flat walk, over the 6*4*4 entries of table j */
+        {
+            int q = h->quant4_mf[j][0][0][i];
+            if( maxQ4 < q )
+                maxQ4 = q;
+            if( maxQdc < q && i%16 == 0 )   /* every 16th entry, i.e. the first element of each 4x4 matrix */
+                maxQdc = q;
+        }
+
  #ifdef HAVE_MMXEXT
-    if( cpu&X264_CPU_MMX )
+
+    /* select quant_8x8 based on CPU and maxQ8 */
+    if( maxQ8 < (1<<15) && cpu&X264_CPU_MMX )   /* maximum below 1<<15: core15 kernel, plain MMX is enough */
+        pf->quant_8x8_core = x264_quant_8x8_core15_mmx;
+    else
+    if( maxQ8 < (1<<16) && cpu&X264_CPU_MMXEXT )   /* maximum below 1<<16: core16 kernel, needs MMXEXT */
+        pf->quant_8x8_core = x264_quant_8x8_core16_mmxext;
+    else
+    if( cpu&X264_CPU_MMXEXT )   /* any other maximum: core32 kernel, also MMXEXT only */
+        pf->quant_8x8_core = x264_quant_8x8_core32_mmxext;   /* if none of the three tests pass, the C function set above stays */
+
+    /* select quant_4x4 based on CPU and maxQ4 */
+    if( maxQ4 < (1<<15) && cpu&X264_CPU_MMX )   /* same three-step chain as for quant_8x8, driven by maxQ4 */
+        pf->quant_4x4_core = x264_quant_4x4_core15_mmx;
+    else
+    if( maxQ4 < (1<<16) && cpu&X264_CPU_MMXEXT )
+        pf->quant_4x4_core = x264_quant_4x4_core16_mmxext;
+    else
+    if( cpu&X264_CPU_MMXEXT )
+        pf->quant_4x4_core = x264_quant_4x4_core32_mmxext;
+
+    /* select quant_XxX_dc based on CPU and maxQdc */
+    if( maxQdc < (1<<16) && cpu&X264_CPU_MMXEXT )   /* here the core16 MMXEXT test comes before the core15 MMX test, unlike the two chains above */
      {
-        int i;
-
-        pf->quant_8x8_core = x264_quant_8x8_core16_mmx;
-        pf->quant_4x4_core = x264_quant_4x4_core16_mmx;
-        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmx;
-        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmx;
-
-        name[0] = name[1] = "16MMX";
-        name[2] = name[3] = "32MMX";
-
-        for( i = 0; i < 2*6*8*8; i++ )
-            if( (***h->quant8_mf)[i] >= 0x8000 )
-            {
-                pf->quant_8x8_core = x264_quant_8x8_core32_mmx;
-                name[0] = "32MMX";
-            }
-
-        for( i = 0; i < 4*6*4*4; i++ )
-            if( (***h->quant4_mf)[i] >= 0x8000 )
-            {
-                pf->quant_4x4_core = x264_quant_4x4_core32_mmx;
-                name[1] = "32MMX";
-            }
+        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core16_mmxext;   /* both DC kernels are always switched together */
+        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core16_mmxext;
+    }
+    else
+    if( maxQdc < (1<<15) && cpu&X264_CPU_MMX )   /* reached only when the MMXEXT test above failed */
+    {
+        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core15_mmx;
+        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core15_mmx;
+    }
+    else
+    if( cpu&X264_CPU_MMXEXT )   /* MMXEXT present but maxQdc is 1<<16 or larger */
+    {
+        pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmxext;
+        pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
      }
-#endif
  
-    x264_log( h, X264_LOG_DEBUG, "using quant functions 8x8=%s 4x4=%s dc4x4=%s dc2x2=%s\n",
-              name[0], name[1], name[2], name[3] );
+    if( cpu&X264_CPU_MMX )   /* dequant choice depends on the cpu flag only, not on the table maxima */
+    {
+        /* dequant is not subject to the above CQM-dependent overflow issues,
+         * as long as the inputs are in the range generable by dct+quant.
+         * that is not guaranteed by the standard, but is true within x264 */
+        pf->dequant_4x4 = x264_dequant_4x4_mmx;
+        pf->dequant_8x8 = x264_dequant_8x8_mmx;
+    }
+#endif  /* HAVE_MMXEXT */
+    
+#ifdef ARCH_PPC
+    if( cpu&X264_CPU_ALTIVEC ) {   /* each AltiVec kernel is installed only when its table maximum is below 1<<16 */
+        if( maxQ8 < (1<<16) )
+        {
+            pf->quant_8x8_core = x264_quant_8x8_altivec;
+        }
+        if( maxQ4 < (1<<16) )
+        {
+            pf->quant_4x4_core = x264_quant_4x4_altivec;
+        }
+        if( maxQdc < (1<<16) )
+        {
+           pf->quant_4x4_dc_core = x264_quant_4x4_dc_altivec;
+        }
+    }   /* quant_2x2_dc_core and the dequant pointers are left untouched in this branch */
+#endif /* ARCH_PPC */
  }