git.sesse.net Git - x264/commitdiff
Massive overhaul of nnz/cbp calculation
author     Fiona Glaser <fiona@x264.com>
           Fri, 30 Jan 2009 11:40:54 +0000 (03:40 -0800)
committer  Fiona Glaser <fiona@x264.com>
           Fri, 30 Jan 2009 12:11:24 +0000 (04:11 -0800)
Modify quantization to also calculate array_non_zero.
PPC assembly changes by gpoirier.
New quant asm includes some small tweaks to quant, plus SSE4 versions that use ptest for the array_non_zero check.
Use this new feature of quant to merge nnz/cbp calculation directly into encoding, avoiding many unnecessary calls to dequant/zigzag/decimate/etc.
Also add a new i16x16 DC-only iDCT with asm.
Since intra encoding now calculates nnz directly, skip_intra now backs up nnz/cbp as well.
Output should be equivalent except when using p4x4+RDO, because of a subtlety involving old nnz values lying around.
Performance increase in macroblock_encode: ~18% with dct-decimate, ~30% without, at CRF 25.
Overall performance increase: 0-6% depending on encoding settings.
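
For illustration, a minimal scalar sketch of the new quant contract, mirroring quant_4x4 in common/quant.c after this patch (the sketch name is hypothetical; the real code expands a QUANT_ONE macro):

#include <stdint.h>

/* Minimal sketch, not the patch's exact code: quantize 16 coefficients,
 * OR each result into an accumulator, and return a 0/1 nonzero flag so
 * array_non_zero comes for free with quantization. */
static int quant_4x4_sketch( int16_t dct[16], const uint16_t mf[16],
                             const uint16_t bias[16] )
{
    int i, nz = 0;
    for( i = 0; i < 16; i++ )
    {
        int coef = dct[i];
        if( coef > 0 )
            coef = ( bias[i] + coef ) * mf[i] >> 16;
        else
            coef = -( ( bias[i] - coef ) * mf[i] >> 16 );
        dct[i] = coef;
        nz |= coef;
    }
    return !!nz;
}

Callers such as x264_mb_encode_i4x4 store the return value straight into the NNZ cache and OR it into i_cbp_luma, which is what removes the separate array_non_zero pass.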

18 files changed:
common/common.h
common/dct.c
common/dct.h
common/ppc/quant.c
common/ppc/quant.h
common/quant.c
common/quant.h
common/x86/dct-a.asm
common/x86/dct.h
common/x86/quant-a.asm
common/x86/quant.h
encoder/analyse.c
encoder/cabac.c
encoder/cavlc.c
encoder/macroblock.c
encoder/macroblock.h
encoder/rdo.c
tools/checkasm.c

diff --git a/common/common.h b/common/common.h
index 78b1efb64384b4f7bdb0a44e03f12d664fa51f71..97c687812e7f0aca1f26b0017b53ff5e59876107 100644
--- a/common/common.h
+++ b/common/common.h
@@ -471,6 +471,10 @@ struct x264_t
             DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
             DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
             DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
+            uint32_t i4x4_nnz_buf[4];
+            uint32_t i8x8_nnz_buf[4];
+            int i4x4_cbp;
+            int i8x8_cbp;
 
             /* Psy trellis DCT data */
             DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
diff --git a/common/dct.c b/common/dct.c
index f8d51e40feb3312ed1e74922fd4440a1293dd905..5f9f0fb0650da2e5f65b036697af0c19d75d97a1 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -369,6 +369,18 @@ static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
 }
 
+static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
+{
+    int i;
+    for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
+    {
+        add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
+        add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
+        add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
+        add4x4_idct_dc( &p_dst[12], dct[i][3] );
+    }
+}
+
 
 /****************************************************************************
  * x264_dct_init:
@@ -384,6 +396,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 
     dctf->sub16x16_dct  = sub16x16_dct;
     dctf->add16x16_idct = add16x16_idct;
+    dctf->add16x16_idct_dc = add16x16_idct_dc;
 
     dctf->sub8x8_dct8   = sub8x8_dct8;
     dctf->add8x8_idct8  = add8x8_idct8;
@@ -400,6 +413,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
+        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 
@@ -427,10 +441,14 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
         dctf->add16x16_idct = x264_add16x16_idct_sse2;
+        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
     }
 
     if( cpu&X264_CPU_SSSE3 )
+    {
         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
+        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
+    }
 #endif //HAVE_MMX
 
 #ifdef ARCH_PPC
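
For reference, the new C fallback above behaves as below when written out without the add4x4_idct_dc helper; a sketch assuming x264's FDEC_STRIDE of 32 and the (dc+32)>>6 rounding the asm versions use:

#include <stdint.h>

#define FDEC_STRIDE 32  /* x264's fixed reconstruction stride */

static uint8_t clip_uint8( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

/* Sketch of add16x16_idct_dc: each of the 16 4x4 blocks gets one rounded
 * DC value added to all of its pixels; no AC coefficients are touched. */
static void add16x16_idct_dc_sketch( uint8_t *dst, const int16_t dc[4][4] )
{
    int bx, by, x, y;
    for( by = 0; by < 4; by++ )
        for( bx = 0; bx < 4; bx++ )
        {
            int d = ( dc[by][bx] + 32 ) >> 6;
            uint8_t *p = dst + by*4*FDEC_STRIDE + bx*4;
            for( y = 0; y < 4; y++ )
                for( x = 0; x < 4; x++ )
                    p[y*FDEC_STRIDE+x] = clip_uint8( p[y*FDEC_STRIDE+x] + d );
        }
}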
diff --git a/common/dct.h b/common/dct.h
index f4474fcc00078c787547e5937587dc267c744d50..71951f9b1b3fd19216eb7296fa897a4306c0a478 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -100,6 +100,7 @@ typedef struct
 
     void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
     void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
+    void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[4][4] );
 
     void (*sub8x8_dct8)  ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
     void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] );
diff --git a/common/ppc/quant.c b/common/ppc/quant.c
index 64b34ab5936674d5ccc4b111a8953bcc5526dd56..d1d9d72ab68a4670f69dad86ca7237f2caf51e33 100644
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -30,10 +30,10 @@ mfvA = vec_ld((idx0), mf);                                                   \
 mfvB = vec_ld((idx1), mf);                                                   \
 biasvA = vec_ld((idx0), bias);                                               \
 biasvB = vec_ld((idx1), bias);                                               \
-mskA = vec_cmplt(temp1v, zerov);                                             \
-mskB = vec_cmplt(temp2v, zerov);                                             \
-coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v);                 \
-coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v);                 \
+mskA = vec_cmplt(temp1v, zero_s16v);                                         \
+mskB = vec_cmplt(temp2v, zero_s16v);                                         \
+coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);             \
+coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);             \
 coefvA = vec_adds(coefvA, biasvA);                                           \
 coefvB = vec_adds(coefvB, biasvB);                                           \
 multEvenvA = vec_mule(coefvA, mfvA);                                         \
@@ -51,17 +51,20 @@ temp2v = vec_xor(temp2v, mskB);                                              \
 temp1v = vec_adds(temp1v, vec_and(mskA, one));                               \
 vec_st(temp1v, (idx0), (int16_t*)dct);                                       \
 temp2v = vec_adds(temp2v, vec_and(mskB, one));                               \
+nz = vec_or(nz, vec_or(temp1v, temp2v));                                     \
 vec_st(temp2v, (idx1), (int16_t*)dct);
                 
-void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
 {
+    LOAD_ZERO;
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
     vec_u16_t mfvA;
     vec_u16_t biasvA;
-    vec_s16_t zerov, one;
+    vec_s16_t one = vec_splat_s16(1);
+    vec_s16_t nz = zero_s16v;
 
     vector bool short mskB;
     vec_u16_t coefvB;
@@ -75,20 +78,18 @@ void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[1
     qbits_u.s[0]=16;
     i_qbitsv = vec_splat(qbits_u.v, 0);
 
-    zerov = vec_splat_s16(0);
-    one = vec_splat_s16(1);
-
     QUANT_16_U( 0, 16 );
+    return vec_any_ne(nz, zero_s16v);
 }
 
 // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
 #define QUANT_16_U_DC( idx0, idx1 )                             \
 temp1v = vec_ld((idx0), *dct);                                  \
 temp2v = vec_ld((idx1), *dct);                                  \
-mskA = vec_cmplt(temp1v, zerov);                                \
-mskB = vec_cmplt(temp2v, zerov);                                \
-coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v);   \
-coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v);   \
+mskA = vec_cmplt(temp1v, zero_s16v);                            \
+mskB = vec_cmplt(temp2v, zero_s16v);                            \
+coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
+coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
 coefvA = vec_add(coefvA, biasv);                                \
 coefvB = vec_add(coefvB, biasv);                                \
 multEvenvA = vec_mule(coefvA, mfv);                             \
@@ -106,15 +107,18 @@ temp2v = vec_xor(temp2v, mskB);                                 \
 temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
 vec_st(temp1v, (idx0), (int16_t*)dct);                          \
 temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
+nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
 vec_st(temp2v, (idx1), (int16_t*)dct);
 
-void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
+int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
 {
+    LOAD_ZERO;
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
-    vec_s16_t zerov, one;
+    vec_s16_t one = vec_splat_s16(1);
+    vec_s16_t nz = zero_s16v;
 
     vector bool short mskB;
     vec_u16_t coefvB;
@@ -137,18 +141,16 @@ void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
     bias_u.s[0]=bias;
     biasv = vec_splat(bias_u.v, 0);
 
-    zerov = vec_splat_s16(0);
-    one = vec_splat_s16(1);
-
     QUANT_16_U_DC( 0, 16 );
+    return vec_any_ne(nz, zero_s16v);
 }
 
 // DC quant of a whole 2x2 block
 #define QUANT_4_U_DC( idx0 )                                    \
 const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);      \
 temp1v = vec_ld((idx0), *dct);                                  \
-mskA = vec_cmplt(temp1v, zerov);                                \
-coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v);   \
+mskA = vec_cmplt(temp1v, zero_s16v);                            \
+coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
 coefvA = vec_add(coefvA, biasv);                                \
 multEvenvA = vec_mule(coefvA, mfv);                             \
 multOddvA = vec_mulo(coefvA, mfv);                              \
@@ -158,15 +160,18 @@ temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(mul
 temp2v = vec_xor(temp2v, mskA);                                 \
 temp2v = vec_add(temp2v, vec_and(mskA, one));                   \
 temp1v = vec_sel(temp1v, temp2v, sel);                          \
+nz = vec_or(nz, temp1v);                                        \
 vec_st(temp1v, (idx0), (int16_t*)dct);
 
-void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
+int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
 {
+    LOAD_ZERO;
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
-    vec_s16_t zerov, one;
+    vec_s16_t one = vec_splat_s16(1);
+    vec_s16_t nz = zero_s16v;
 
     vec_s16_t temp1v, temp2v;
 
@@ -185,42 +190,41 @@ void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
     bias_u.s[0]=bias;
     biasv = vec_splat(bias_u.v, 0);
 
-    zerov = vec_splat_s16(0);
-    one = vec_splat_s16(1);
-
+    static const vec_s16_t mask2 = CV(-1, -1, -1, -1,  0, 0, 0, 0);
     QUANT_4_U_DC(0);
+    return vec_any_ne(vec_and(nz, mask2), zero_s16v);
 }
 
-void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
 {
+    LOAD_ZERO;
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
     vec_u16_t mfvA;
     vec_u16_t biasvA;
-    vec_s16_t zerov, one;
-    
+    vec_s16_t one = vec_splat_s16(1);
+    vec_s16_t nz = zero_s16v;
+
     vector bool short mskB;
     vec_u16_t coefvB;
     vec_u32_t multEvenvB, multOddvB;
     vec_u16_t mfvB;
     vec_u16_t biasvB;
-    
+
     vec_s16_t temp1v, temp2v;
     
     vec_u32_u qbits_u;
     qbits_u.s[0]=16;
     i_qbitsv = vec_splat(qbits_u.v, 0);
-
-    zerov = vec_splat_s16(0);
-    one = vec_splat_s16(1);
     
     int i;
 
     for ( i=0; i<4; i++ ) {
       QUANT_16_U( i*2*16, i*2*16+16 );
     }
+    return vec_any_ne(nz, zero_s16v);
 }
 
 #define DEQUANT_SHL()                                                \
diff --git a/common/ppc/quant.h b/common/ppc/quant.h
index 0504900384561fdd70a5881450f2222ba6b89d2d..f55a934afa222e7866481a6df0a0316d25f50094 100644
--- a/common/ppc/quant.h
+++ b/common/ppc/quant.h
 #ifndef X264_PPC_QUANT_H
 #define X264_PPC_QUANT_H
 
-void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
 
-void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
-void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
+int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
+int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
 
 void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
 void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
diff --git a/common/quant.c b/common/quant.c
index ac798a25a8d58f901245079623b29845bf42e4ed..daf2b5a20d350ccd572b9f68ec72f2dc56151ab3 100644
--- a/common/quant.c
+++ b/common/quant.c
         (coef) = (f + (coef)) * (mf) >> 16; \
     else \
         (coef) = - ((f - (coef)) * (mf) >> 16); \
+    nz |= (coef); \
 }
 
-static void quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+static int quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
 {
-    int i;
+    int i, nz = 0;
     for( i = 0; i < 64; i++ )
         QUANT_ONE( dct[0][i], mf[i], bias[i] );
+    return !!nz;
 }
 
-static void quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+static int quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
 {
-    int i;
+    int i, nz = 0;
     for( i = 0; i < 16; i++ )
         QUANT_ONE( dct[0][i], mf[i], bias[i] );
+    return !!nz;
 }
 
-static void quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
+static int quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
 {
-    int i;
+    int i, nz = 0;
     for( i = 0; i < 16; i++ )
         QUANT_ONE( dct[0][i], mf, bias );
+    return !!nz;
 }
 
-static void quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
+static int quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
 {
+    int nz = 0;
     QUANT_ONE( dct[0][0], mf, bias );
     QUANT_ONE( dct[0][1], mf, bias );
     QUANT_ONE( dct[0][2], mf, bias );
     QUANT_ONE( dct[0][3], mf, bias );
+    return !!nz;
 }
 
 #define DEQUANT_SHL( x ) \
@@ -402,6 +408,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score16 = x264_decimate_score16_ssse3;
         pf->decimate_score64 = x264_decimate_score64_ssse3;
     }
+
+    if( cpu&X264_CPU_SSE4 )
+    {
+        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
+        pf->quant_4x4 = x264_quant_4x4_sse4;
+        pf->quant_8x8 = x264_quant_8x8_sse4;
+    }
 #endif // HAVE_MMX
 
 #ifdef ARCH_PPC
diff --git a/common/quant.h b/common/quant.h
index eaac5937c09f13efa07e7811b851ef2fcf2900ef..b8a7b988aa9ae14bfc6bfd0bdc9a94f062db2133 100644
--- a/common/quant.h
+++ b/common/quant.h
 
 typedef struct
 {
-    void (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-    void (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-    void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
-    void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
+    int (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+    int (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+    int (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
+    int (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
 
     void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
     void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index a474932fe40b703ca317759ecfe709ae97899b7e..156a7ae41c9df20f88a1d475259b5612a6c9aef6 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -33,6 +33,7 @@ pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
 pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
+pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
 
 SECTION .text
 
@@ -324,6 +325,104 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
     movhps    [r0+FDEC_STRIDE* 3], xmm5
     ret
 
+cglobal x264_add16x16_idct_dc_mmx, 2,3
+    mov       r2, 4
+.loop:
+    movq      mm0, [r1]
+    pxor      mm1, mm1
+    paddw     mm0, [pw_32 GLOBAL]
+    psraw     mm0, 6
+    psubw     mm1, mm0
+    packuswb  mm0, mm0
+    packuswb  mm1, mm1
+    punpcklbw mm0, mm0
+    punpcklbw mm1, mm1
+    pshufw    mm2, mm0, 0xFA
+    pshufw    mm3, mm1, 0xFA
+    punpcklbw mm0, mm0
+    punpcklbw mm1, mm1
+    ADD_DC    mm0, mm1, r0
+    ADD_DC    mm2, mm3, r0+8
+    add       r1, 8
+    add       r0, FDEC_STRIDE*4
+    dec       r2
+    jg .loop
+    ret
+
+%macro IDCT_DC_STORE 3
+    movdqa    xmm4, [r0+%1+FDEC_STRIDE*0]
+    movdqa    xmm5, [r0+%1+FDEC_STRIDE*1]
+    movdqa    xmm6, [r0+%1+FDEC_STRIDE*2]
+    movdqa    xmm7, [r0+%1+FDEC_STRIDE*3]
+    paddusb   xmm4, %2
+    paddusb   xmm5, %2
+    paddusb   xmm6, %2
+    paddusb   xmm7, %2
+    psubusb   xmm4, %3
+    psubusb   xmm5, %3
+    psubusb   xmm6, %3
+    psubusb   xmm7, %3
+    movdqa    [r0+%1+FDEC_STRIDE*0], xmm4
+    movdqa    [r0+%1+FDEC_STRIDE*1], xmm5
+    movdqa    [r0+%1+FDEC_STRIDE*2], xmm6
+    movdqa    [r0+%1+FDEC_STRIDE*3], xmm7
+%endmacro
+
+cglobal x264_add16x16_idct_dc_sse2, 2,2
+    call .loop
+    add       r0, FDEC_STRIDE*4
+.loop:
+    add       r0, FDEC_STRIDE*4
+    movq      xmm0, [r1+0]
+    movq      xmm2, [r1+8]
+    add       r1, 16
+    punpcklwd xmm0, xmm0
+    punpcklwd xmm2, xmm2
+    pxor      xmm1, xmm1
+    pxor      xmm3, xmm3
+    paddw     xmm0, [pw_32 GLOBAL]
+    paddw     xmm2, [pw_32 GLOBAL]
+    psraw     xmm0, 6
+    psraw     xmm2, 6
+    psubw     xmm1, xmm0
+    psubw     xmm3, xmm2
+    packuswb  xmm0, xmm1
+    packuswb  xmm2, xmm3
+    movdqa    xmm1, xmm0
+    movdqa    xmm3, xmm2
+    punpcklbw xmm0, xmm0
+    punpcklbw xmm2, xmm2
+    punpckhbw xmm1, xmm1
+    punpckhbw xmm3, xmm3
+    IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
+    IDCT_DC_STORE 0, xmm2, xmm3
+    ret
+
+cglobal x264_add16x16_idct_dc_ssse3, 2,2
+    call .loop
+    add       r0, FDEC_STRIDE*4
+.loop:
+    add       r0, FDEC_STRIDE*4
+    movdqa    xmm0, [r1]
+    add       r1, 16
+    pxor      xmm1, xmm1
+    paddw     xmm0, [pw_32 GLOBAL]
+    psraw     xmm0, 6
+    psubw     xmm1, xmm0
+    movdqa    xmm5, [ pb_idctdc_unpack GLOBAL]
+    movdqa    xmm6, [pb_idctdc_unpack2 GLOBAL]
+    packuswb  xmm0, xmm0
+    packuswb  xmm1, xmm1
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    pshufb    xmm0, xmm5
+    pshufb    xmm2, xmm6
+    pshufb    xmm1, xmm5
+    pshufb    xmm3, xmm6
+    IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
+    IDCT_DC_STORE 0, xmm2, xmm3
+    ret
+
 ;-----------------------------------------------------------------------------
 ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
diff --git a/common/x86/dct.h b/common/x86/dct.h
index d30fa97274c25733dd3ea795aeb89cd4d31895aa..99392761b7ab0d693170a7e77cfff53188302f13 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -34,9 +34,12 @@ void x264_add4x4_idct_mmx    ( uint8_t *p_dst, int16_t dct[ 4][4]    );
 void x264_add8x8_idct_mmx    ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
 void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
 void x264_add16x16_idct_mmx  ( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[4][4] );
 void x264_add8x8_idct_sse2   ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
 void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
+void x264_add16x16_idct_dc_sse2( uint8_t *p_dst, int16_t dct[4][4] );
 void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] );
+void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[4][4] );
 
 void x264_dct4x4dc_mmx       ( int16_t d[4][4] );
 void x264_idct4x4dc_mmx      ( int16_t d[4][4] );
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index d1fd8693f1ebf80c608787e74ef0a565b963435c..d1b39919abc4b0f60f69f89cacb6e5b8296ec188 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -29,6 +29,7 @@ SECTION_RODATA
 pb_1:     times 16 db 1
 pw_1:     times 8 dw 1
 pd_1:     times 4 dd 1
+pb_01:    times 8 db 0, 1
 
 %macro DQM4 3
     dw %1, %2, %1, %2, %2, %3, %2, %3
@@ -70,7 +71,7 @@ decimate_mask_table4:
 
 SECTION .text
 
-%macro QUANT_DC_START 0
+%macro QUANT_DC_START_MMX 0
     movd       m6, r1m     ; mf
     movd       m7, r2m     ; bias
 %ifidn m0, mm0
@@ -84,6 +85,14 @@ SECTION .text
 %endif
 %endmacro
 
+%macro QUANT_DC_START_SSSE3 0
+    movdqa     m5, [pb_01 GLOBAL]
+    movd       m6, r1m     ; mf
+    movd       m7, r2m     ; bias
+    pshufb     m6, m5
+    pshufb     m7, m5
+%endmacro
+
 %macro PABSW_MMX 2
     pxor       %1, %1
     pcmpgtw    %1, %2
@@ -105,7 +114,7 @@ SECTION .text
     psignw     %1, %2
 %endmacro
 
-%macro QUANT_ONE 3
+%macro QUANT_ONE 4
 ;;; %1      (m64)       dct[y][x]
 ;;; %2      (m64/mmx)   mf[y][x] or mf[0][0] (as uint16_t)
 ;;; %3      (m64/mmx)   bias[y][x] or bias[0][0] (as uint16_t)
@@ -115,6 +124,62 @@ SECTION .text
     pmulhuw    m0, %2   ; divide
     PSIGNW     m0, m1   ; restore sign
     mova       %1, m0   ; store
+%if %4
+    por        m5, m0
+%else
+    SWAP       m5, m0
+%endif
+%endmacro
+
+%macro QUANT_TWO 7
+    mova       m1, %1
+    mova       m3, %2
+    PABSW      m0, m1
+    PABSW      m2, m3
+    paddusw    m0, %5
+    paddusw    m2, %6
+    pmulhuw    m0, %3
+    pmulhuw    m2, %4
+    PSIGNW     m0, m1
+    PSIGNW     m2, m3
+    mova       %1, m0
+    mova       %2, m2
+%if %7
+    por        m5, m0
+    por        m5, m2
+%else
+    SWAP       m5, m0
+    por        m5, m2
+%endif
+%endmacro
+
+%macro QUANT_END_MMX 0
+    xor      eax, eax
+%ifndef ARCH_X86_64
+%if mmsize==8
+    packsswb  m5, m5
+    movd     ecx, m5
+    test     ecx, ecx
+%else
+    pxor      m4, m4
+    pcmpeqb   m5, m4
+    pmovmskb ecx, m5
+    cmp      ecx, (1<<mmsize)-1
+%endif
+%else
+%if mmsize==16
+    packsswb  m5, m5
+%endif
+    movq     rcx, m5
+    test     rcx, rcx
+%endif
+    setne     al
+%endmacro
+
+%macro QUANT_END_SSE4 0
+    xor      eax, eax
+    ptest     m5, m5
+    setne     al
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -123,30 +188,38 @@ SECTION .text
 %macro QUANT_DC 2
 cglobal %1, 1,1
     QUANT_DC_START
+%if %2==1
+    QUANT_ONE [r0], m6, m7, 0
+%else
 %assign x 0
-%rep %2
-    QUANT_ONE [r0+x], m6, m7
-%assign x x+mmsize
+%rep %2/2
+    QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
+%assign x x+mmsize*2
 %endrep
+%endif
+    QUANT_END
     RET
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+; int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 ;-----------------------------------------------------------------------------
 %macro QUANT_AC 2
 cglobal %1, 3,3
 %assign x 0
-%rep %2
-    QUANT_ONE [r0+x], [r1+x], [r2+x]
-%assign x x+mmsize
+%rep %2/2
+    QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
+%assign x x+mmsize*2
 %endrep
+    QUANT_END
     RET
 %endmacro
 
 INIT_MMX
+%define QUANT_END QUANT_END_MMX
 %define PABSW PABSW_MMX
 %define PSIGNW PSIGNW_MMX
+%define QUANT_DC_START QUANT_DC_START_MMX
 QUANT_DC x264_quant_2x2_dc_mmxext, 1
 %ifndef ARCH_X86_64 ; not needed because sse2 is faster
 QUANT_DC x264_quant_4x4_dc_mmxext, 4
@@ -167,6 +240,13 @@ QUANT_AC x264_quant_8x8_ssse3, 8
 
 INIT_MMX
 QUANT_DC x264_quant_2x2_dc_ssse3, 1
+%define QUANT_END QUANT_END_SSE4
+;Not faster on Conroe, so only used in SSE4 versions
+%define QUANT_DC_START QUANT_DC_START_SSSE3
+INIT_XMM
+QUANT_DC x264_quant_4x4_dc_sse4, 2
+QUANT_AC x264_quant_4x4_sse4, 2
+QUANT_AC x264_quant_8x8_sse4, 8
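
The SSE4 variants above end with QUANT_END_SSE4, which replaces the pack/pmovmskb reduction with a single ptest of the coefficient-OR accumulator. A rough intrinsics equivalent (a sketch, not the patch's yasm):

#include <smmintrin.h>  /* SSE4.1: _mm_testz_si128 maps to PTEST */

/* acc holds the running OR of every quantized coefficient (m5 in the asm);
 * PTEST sets ZF iff acc & acc == 0, so the nonzero flag is its inverse. */
static int quant_end_sse4_sketch( __m128i acc )
{
    return !_mm_testz_si128( acc, acc );
}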
 
 
 
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 878699f916775e6fc6f772d9c6a5fea204f29acf..dff60a85906b5c470e5a636503188acee8e53058 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
 #ifndef X264_I386_QUANT_H
 #define X264_I386_QUANT_H
 
-void x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias );
-void x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias );
-void x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-void x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-void x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias );
-void x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-void x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-void x264_quant_2x2_dc_ssse3( int16_t dct[2][2], int mf, int bias );
-void x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
-void x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-void x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias );
+int x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias );
+int x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias );
+int x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_2x2_dc_ssse3( int16_t dct[2][2], int mf, int bias );
+int x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
+int x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse4( int16_t dct[4][4], int mf, int bias );
+int x264_quant_4x4_sse4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse4( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
 void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
 void x264_dequant_4x4dc_mmxext( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
 void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
diff --git a/encoder/analyse.c b/encoder/analyse.c
index bd53ebfe487e505495960ce8d49787d9e7874f47..63a74ea9c601a0af09e26cdb96d4f3049f5ba28a 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -665,6 +665,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
         int i_cost = 0;
+        h->mb.i_cbp_luma = 0;
         b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0];
 
         // FIXME some bias like in i4x4?
@@ -732,6 +733,11 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             if( h->mb.i_skip_intra )
             {
                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
+                h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
+                h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
+                h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
+                h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                 if( h->mb.i_skip_intra == 2 )
                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
             }
@@ -751,6 +757,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
     {
         int i_cost;
         int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
+        h->mb.i_cbp_luma = 0;
         b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0];
         if( a->i_mbrd )
             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
@@ -817,6 +824,11 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             if( h->mb.i_skip_intra )
             {
                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
+                h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
+                h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
+                h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
+                h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+                h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
                 if( h->mb.i_skip_intra == 2 )
                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
             }
@@ -1951,6 +1963,8 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
+            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
+             * for future blocks are those left over from previous RDO calls. */
             for( i = 0; i < 4; i++ )
             {
                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 4fa74033f52d5bff1fa8957ed3a2e85e07c5701f..2015da5e6fdf13dd029dc776905c77f581995c9d 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -1142,20 +1142,10 @@ static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, i
 static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_mode )
 {
     const int i_pred = x264_mb_predict_intra4x4_mode( h, 4*i8 );
-    const int nnz = array_non_zero(h->dct.luma8x8[i8]);
     i_mode = x264_mb_pred_mode4x4_fix( i_mode );
     x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
-    if( nnz )
-    {
-        *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101;
-        *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
+    if( h->mb.i_cbp_luma & (1 << i8) )
         block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] );
-    }
-    else
-    {
-        *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0;
-        *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
-    }
 }
 
 static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode )
@@ -1163,7 +1153,6 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4,
     const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 );
     i_mode = x264_mb_pred_mode4x4_fix( i_mode );
     x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
-    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
     block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
 }
 
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 4f4ff03371a373a4ec934b56d2c980276f40fe21..e499fac598350df3161c24d56a348e9ca243b85d 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -702,7 +702,6 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
 static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
 {
     h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
-    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
     block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     return h->out.bs.i_bits_encoded;
 }
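
The macroblock.c diff below adds a comment spelling out the new contract: entropy coding checks CBP first, then NNZ, before touching coefficients, so decimation only has to fix up those two. A condensed sketch of that idea (hypothetical helper; a flat nnz array stands in for h->mb.cache.non_zero_count and its scan8 layout):

#include <stdint.h>

/* Hypothetical sketch: drop a decimated luma 8x8 block by clearing its CBP
 * bit and its four cached NNZ bytes, leaving the stale coefficients in
 * place -- they are never read once CBP/NNZ say the block is empty. */
static void decimate_8x8_sketch( int *cbp_luma, uint8_t nnz[16], int i8 )
{
    *cbp_luma &= ~(1 << i8);        /* entropy coder skips the block   */
    nnz[i8*4+0] = nnz[i8*4+1] = 0;  /* NNZ stays correct for context   */
    nnz[i8*4+2] = nnz[i8*4+3] = 0;  /* selection in neighbouring blocks */
}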
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 30df7781810b9b6238cb4cda1f8cb359dfd81b58..6faa305e7c52cf3c6f510f473d1adce3b448106b 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -84,26 +84,38 @@ static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
     dct4x4[3][0][0] = 0;
 }
 
-static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
 {
     int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
     if( h->mb.b_trellis )
-        x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx );
+        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx );
     else
-        h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
+        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
 }
 
-static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
 {
     int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
     if( h->mb.b_trellis )
-        x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
+        return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
     else
-        h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
+        return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
 }
 
+/* All encoding functions must output the correct CBP and NNZ values.
+ * The entropy coding functions will check CBP first, then NNZ, before
+ * actually reading the DCT coefficients.  NNZ still must be correct even
+ * if CBP is zero because of the use of NNZ values for context selection.
+ * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
+ * that is only needed in CAVLC, and will be calculated by CAVLC's residual
+ * coding and stored as necessary. */
+
+/* This means that decimation can be done merely by adjusting the CBP and NNZ
+ * rather than memsetting the coefficients. */
+
 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
 {
+    int nz;
     uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
     DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
@@ -111,29 +123,36 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
     if( h->mb.b_lossless )
     {
         h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
+        nz = array_non_zero( h->dct.luma4x4[idx] );
+        h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
+        h->mb.i_cbp_luma |= nz<<(idx>>2);
         return;
     }
 
     h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
 
-    x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
-
-    if( array_non_zero( dct4x4 ) )
+    nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
+    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
+    if( nz )
     {
+        h->mb.i_cbp_luma |= 1<<(idx>>2);
         h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
         h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
-
-        /* output samples to fdec */
         h->dctf.add4x4_idct( p_dst, dct4x4 );
     }
-    else
-        memset( h->dct.luma4x4[idx], 0, sizeof(h->dct.luma4x4[idx]));
+}
+
+#define STORE_8x8_NNZ(idx,nz)\
+{\
+    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\
+    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\
 }
 
 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
 {
     int x = 8 * (idx&1);
     int y = 8 * (idx>>1);
+    int nz;
     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
     DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
@@ -141,16 +160,25 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
     if( h->mb.b_lossless )
     {
         h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
+        nz = array_non_zero( h->dct.luma8x8[idx] );
+        STORE_8x8_NNZ(idx,nz);
+        h->mb.i_cbp_luma |= nz<<idx;
         return;
     }
 
     h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
 
-    x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
-
+    nz = x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
     h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
-    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
-    h->dctf.add8x8_idct8( p_dst, dct8x8 );
+    if( nz )
+    {
+        h->mb.i_cbp_luma |= 1<<idx;
+        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
+        h->dctf.add8x8_idct8( p_dst, dct8x8 );
+        STORE_8x8_NNZ(idx,1);
+    }
+    else
+        STORE_8x8_NNZ(idx,0);
 }
 
 static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
@@ -161,7 +189,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
     DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
     DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );
 
-    int i;
+    int i, nz;
 
     if( h->mb.b_lossless )
     {
@@ -172,12 +200,18 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
             h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
             dct_dc4x4[0][block_idx_yx_1d[i]] = h->dct.luma4x4[i][0];
             h->dct.luma4x4[i][0] = 0;
+            nz = array_non_zero( h->dct.luma4x4[i] );
+            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+            h->mb.i_cbp_luma |= nz;
         }
+        h->mb.i_cbp_luma *= 0xf;
+        h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( dct_dc4x4 );
         h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
         return;
     }
 
     h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
+
     for( i = 0; i < 16; i++ )
     {
         /* copy dc coeff */
@@ -185,36 +219,45 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         dct4x4[i][0][0] = 0;
 
         /* quant/scan/dequant */
-        x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
-
-        h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
-        h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
+        nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
+        h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
+        if( nz )
+        {
+            h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
+            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
+            h->mb.i_cbp_luma = 0xf;
+        }
     }
 
     h->dctf.dct4x4dc( dct_dc4x4 );
     if( h->mb.b_trellis )
-        x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1);
+        nz = x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1);
     else
-        h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
-    h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
+        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
 
-    /* output samples to fdec */
-    h->dctf.idct4x4dc( dct_dc4x4 );
-    h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX not inversed */
-
-    /* calculate dct coeffs */
-    for( i = 0; i < 16; i++ )
+    h->mb.cache.non_zero_count[x264_scan8[24]] = nz;
+    if( nz )
     {
-        /* copy dc coeff */
-        dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
+        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
+
+        /* output samples to fdec */
+        h->dctf.idct4x4dc( dct_dc4x4 );
+        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX not inversed */
+        if( h->mb.i_cbp_luma )
+            for( i = 0; i < 16; i++ )
+                dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
     }
+
     /* put pixels to fdec */
-    h->dctf.add16x16_idct( p_dst, dct4x4 );
+    if( h->mb.i_cbp_luma )
+        h->dctf.add16x16_idct( p_dst, dct4x4 );
+    else if( nz )
+        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
 }
 
 void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
 {
-    int i, ch, nz;
+    int i, ch, nz, nz_dc;
     int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
     h->mb.i_cbp_chroma = 0;
 
@@ -223,6 +266,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
         int i_decimate_score = 0;
+        int nz_ac = 0;
 
         DECLARE_ALIGNED_16( int16_t dct2x2[2][2]  );
         DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
@@ -250,52 +294,49 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         for( i = 0; i < 4; i++ )
         {
             if( h->mb.b_trellis )
-                x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 );
+                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 );
             else
-                h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
-            h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
-
-            if( b_decimate )
-                i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
+                nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
+            h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
+            if( nz )
+            {
+                nz_ac = 1;
+                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
+                h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                if( b_decimate )
+                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
+            }
         }
 
         if( h->mb.b_trellis )
-            x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
+            nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
         else
-            h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
+            nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
 
-        if( b_decimate && i_decimate_score < 7 )
+        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = nz_dc;
+
+        if( (b_decimate && i_decimate_score < 7) || !nz_ac )
         {
             /* Decimate the block */
             h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
             h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
             h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
             h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
-            if( !array_non_zero( dct2x2 ) ) /* Whole block is empty */
-            {
-                h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
+            if( !nz_dc ) /* Whole block is empty */
                 continue;
-            }
             /* DC-only */
-            h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
             zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
             idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
             h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
         }
         else
         {
-            for( i = 0; i < 4; i++ )
+            h->mb.i_cbp_chroma = 1;
+            if( nz_dc )
             {
-                nz = array_non_zero( h->dct.luma4x4[16+ch*4+i] );
-                h->mb.cache.non_zero_count[x264_scan8[16+ch*4+i]] = nz;
-                h->mb.i_cbp_chroma |= nz;
-                if( nz )
-                    h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+                idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
             }
-            /* Don't optimize for the AC-only case--it's very rare */
-            h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( dct2x2 );
-            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-            idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
             h->dctf.add8x8_idct( p_dst, dct4x4 );
         }
     }
@@ -423,8 +464,9 @@ void x264_macroblock_encode( x264_t *h )
     int i_qp = h->mb.i_qp;
     int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
     int b_force_no_skip = 0;
-    int i,j,idx;
-    uint8_t nnz8x8[4] = {1,1,1,1};
+    int i,idx,nz;
+    h->mb.i_cbp_luma = 0;
+    h->mb.cache.non_zero_count[x264_scan8[24]] = 0;
 
     if( h->sh.b_mbaff
         && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
@@ -479,6 +521,11 @@ void x264_macroblock_encode( x264_t *h )
         if( h->mb.i_skip_intra )
         {
             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
+            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0];
+            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1];
+            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2];
+            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3];
+            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
             /* In RD mode, restore the now-overwritten DCT data. */
             if( h->mb.i_skip_intra == 2 )
                 h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
@@ -496,8 +543,6 @@ void x264_macroblock_encode( x264_t *h )
 
             x264_mb_encode_i8x8( h, i, i_qp );
         }
-        for( i = 0; i < 4; i++ )
-            nnz8x8[i] = array_non_zero( h->dct.luma8x8[i] );
     }
     else if( h->mb.i_type == I_4x4 )
     {
@@ -506,6 +551,11 @@ void x264_macroblock_encode( x264_t *h )
         if( h->mb.i_skip_intra )
         {
             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
+            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0];
+            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1];
+            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2];
+            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3];
+            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
             /* In RD mode, restore the now-overwritten DCT data. */
             if( h->mb.i_skip_intra == 2 )
                 h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
@@ -545,7 +595,9 @@ void x264_macroblock_encode( x264_t *h )
                     h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
                                         h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
                                         h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
-                    nnz8x8[i8x8] = array_non_zero( h->dct.luma8x8[i8x8] );
+                    nz = array_non_zero( h->dct.luma8x8[i8x8] );
+                    STORE_8x8_NNZ(i8x8,nz);
+                    h->mb.i_cbp_luma |= nz << i8x8;
                 }
             else
                 for( i4x4 = 0; i4x4 < 16; i4x4++ )
@@ -553,6 +605,9 @@ void x264_macroblock_encode( x264_t *h )
                     h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
                                         h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
                                         h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
+                    nz = array_non_zero( h->dct.luma4x4[i4x4] );
+                    h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
+                    h->mb.i_cbp_luma |= nz << (i4x4>>2);
                 }
         }
         else if( h->mb.b_transform_8x8 )
@@ -566,31 +621,44 @@ void x264_macroblock_encode( x264_t *h )
             {
                 if( h->mb.b_noise_reduction )
                     h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
-                x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
+                nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
 
-                h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
-
-                if( b_decimate )
+                if( nz )
                 {
-                    int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
-                    i_decimate_mb += i_decimate_8x8;
-                    if( i_decimate_8x8 < 4 )
-                        nnz8x8[idx] = 0;
+                    h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
+                    if( b_decimate )
+                    {
+                        int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
+                        i_decimate_mb += i_decimate_8x8;
+                        if( i_decimate_8x8 >= 4 )
+                            h->mb.i_cbp_luma |= 1<<idx;
+                    }
+                    else
+                        h->mb.i_cbp_luma |= 1<<idx;
                 }
-                else
-                    nnz8x8[idx] = array_non_zero( dct8x8[idx] );
             }
 
             if( i_decimate_mb < 6 && b_decimate )
-                *(uint32_t*)nnz8x8 = 0;
+            {
+                h->mb.i_cbp_luma = 0;
+                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
+                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
+                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
+                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+            }
             else
             {
                 for( idx = 0; idx < 4; idx++ )
-                    if( nnz8x8[idx] )
+                {
+                    if( h->mb.i_cbp_luma&(1<<idx) )
                     {
                         h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
                         h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
+                        STORE_8x8_NNZ(idx,1);
                     }
+                    else
+                        STORE_8x8_NNZ(idx,0);
+                }
             }
         }
         else
@@ -601,41 +669,61 @@ void x264_macroblock_encode( x264_t *h )
 
             for( i8x8 = 0; i8x8 < 4; i8x8++ )
             {
-                int i_decimate_8x8;
+                int i_decimate_8x8 = 0;
+                int cbp = 0;
 
                 /* encode one 4x4 block */
-                i_decimate_8x8 = 0;
                 for( i4x4 = 0; i4x4 < 4; i4x4++ )
                 {
                     idx = i8x8 * 4 + i4x4;
 
                     if( h->mb.b_noise_reduction )
                         h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
-                    x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
+                    nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
+                    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
 
-                    h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
-
-                    if( b_decimate && i_decimate_8x8 < 6 )
-                        i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
+                    if( nz )
+                    {
+                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
+                        h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
+                        if( b_decimate && i_decimate_8x8 < 6 )
+                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
+                        cbp = 1;
+                    }
                 }
 
                 /* decimate this 8x8 block */
                 i_decimate_mb += i_decimate_8x8;
-                if( i_decimate_8x8 < 4 && b_decimate )
-                    nnz8x8[i8x8] = 0;
+                if( b_decimate )
+                {
+                    if( i_decimate_8x8 < 4 )
+                        STORE_8x8_NNZ(i8x8,0)
+                    else
+                        h->mb.i_cbp_luma |= 1<<i8x8;
+                }
+                else if( cbp )
+                {
+                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
+                    h->mb.i_cbp_luma |= 1<<i8x8;
+                }
             }
 
-            if( i_decimate_mb < 6 && b_decimate )
-                *(uint32_t*)nnz8x8 = 0;
-            else
+            if( b_decimate )
             {
-                for( i8x8 = 0; i8x8 < 4; i8x8++ )
-                    if( nnz8x8[i8x8] )
-                    {
-                        for( i = 0; i < 4; i++ )
-                            h->quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp );
-                        h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
-                    }
+                if( i_decimate_mb < 6 )
+                {
+                    h->mb.i_cbp_luma = 0;
+                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
+                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
+                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
+                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+                }
+                else
+                {
+                    for( i8x8 = 0; i8x8 < 4; i8x8++ )
+                        if( h->mb.i_cbp_luma&(1<<i8x8) )
+                            h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
+                }
             }
         }
     }
@@ -656,49 +744,6 @@ void x264_macroblock_encode( x264_t *h )
     /* encode the 8x8 blocks */
     x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
 
-    /* coded block pattern and non_zero_count */
-    h->mb.i_cbp_luma = 0x00;
-    if( h->mb.i_type == I_16x16 )
-    {
-        for( i = 0; i < 16; i++ )
-        {
-            int nz = array_non_zero( h->dct.luma4x4[i] );
-            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
-            h->mb.i_cbp_luma |= nz;
-        }
-        h->mb.i_cbp_luma *= 0xf;
-        h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( h->dct.luma16x16_dc );
-    }
-    else
-    {
-        for( i = 0; i < 4; i++)
-        {
-            if(!nnz8x8[i])
-            {
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+i*4]] = 0;
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+i*4]] = 0;
-            }
-            else if( h->mb.b_transform_8x8 )
-            {
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+4*i]] = nnz8x8[i] * 0x0101;
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+4*i]] = nnz8x8[i] * 0x0101;
-                h->mb.i_cbp_luma |= nnz8x8[i] << i;
-            }
-            else
-            {
-                int nz, cbp = 0;
-                for( j = 0; j < 4; j++ )
-                {
-                    nz = array_non_zero( h->dct.luma4x4[j+4*i] );
-                    h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
-                    cbp |= nz;
-                }
-                h->mb.i_cbp_luma |= cbp << i;
-            }
-        }
-        h->mb.cache.non_zero_count[x264_scan8[24]] = 0;
-    }
-
     if( h->param.b_cabac )
     {
         i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
@@ -770,8 +815,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
         /* encode one 4x4 block */
         for( i4x4 = 0; i4x4 < 4; i4x4++ )
         {
-            h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
-            if( !array_non_zero(dct4x4[i4x4]) )
+            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ) )
                 continue;
             h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
             i_decimate_mb += h->quantf.decimate_score16( dctscan );
@@ -805,15 +849,13 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
 
         /* calculate dct DC */
         dct2x2dc( dct2x2, dct4x4 );
-        h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 );
-        if( array_non_zero(dct2x2) )
+        if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
             return 0;
 
         /* calculate dct coeffs */
         for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
         {
-            h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
-            if( !array_non_zero(dct4x4[i4x4]) )
+            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                 continue;
             h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
             i_decimate_mb += h->quantf.decimate_score15( dctscan );
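
In both loops above, i_decimate_mb accumulates per-block decimation scores: a block scores 9 outright if any coefficient magnitude exceeds 1, and otherwise sums a small weight per coefficient that shrinks as the preceding run of zeros grows. A rough sketch of the rule behind decimate_score16(); the run-length table follows the usual JM/x264 convention and is an assumption here:

    static int decimate_score16( const int16_t dctscan[16] )
    {
        static const uint8_t ds_table[16] = {3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0};
        int idx = 15, score = 0;
        while( idx >= 0 && dctscan[idx] == 0 )   /* skip trailing zeros */
            idx--;
        while( idx >= 0 )
        {
            int run = 0;
            if( (unsigned)(dctscan[idx--] + 1) > 2 )
                return 9;                        /* |level| > 1: never decimatable */
            while( idx >= 0 && dctscan[idx] == 0 )
                idx--, run++;
            score += ds_table[run];              /* longer zero runs cost less */
        }
        return score;
    }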
@@ -865,7 +907,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
     uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
     int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
     int nnz8x8 = 0;
-    int ch;
+    int ch, nz;
 
     x264_mb_mc_8x8( h, i8 );
 
@@ -876,8 +918,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         {
             h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
             nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101 * nnz8x8;
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101 * nnz8x8;
+            STORE_8x8_NNZ(i8,nnz8x8);
         }
         else
         {
@@ -898,9 +939,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
             h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec );
             h->dct.luma4x4[16+i8+ch*4][0] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] );
         }
-        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero( h->dct.luma4x4[16+i8] );
-        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero( h->dct.luma4x4[20+i8] );
     }
     else
     {
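
STORE_8x8_NNZ() is new in this commit. Judging from the pair of 16-bit stores it replaces in the hunk above, it presumably expands to something like:

    /* Presumed expansion (an assumption based on the replaced code):
     * write nz (0 or 1) into all four nnz bytes of 8x8 block idx. */
    #define STORE_8x8_NNZ( idx, nz )\
    do {\
        *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[(idx)*4+0]] = 0x0101 * (nz);\
        *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[(idx)*4+2]] = 0x0101 * (nz);\
    } while(0)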
@@ -908,67 +948,53 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         {
             DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
             h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
-            x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
-            h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
-
-            if( b_decimate && !h->mb.b_trellis )
-                nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );
-            else
-                nnz8x8 = array_non_zero( dct8x8 );
-
+            nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
             if( nnz8x8 )
             {
-                h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
-                h->dctf.add8x8_idct8( p_fdec, dct8x8 );
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
+                h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
+
+                if( b_decimate && !h->mb.b_trellis )
+                    nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );
+
+                if( nnz8x8 )
+                {
+                    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
+                    h->dctf.add8x8_idct8( p_fdec, dct8x8 );
+                    STORE_8x8_NNZ(i8,1);
+                }
+                else
+                    STORE_8x8_NNZ(i8,0);
             }
             else
-            {
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
-            }
+                STORE_8x8_NNZ(i8,0);
         }
         else
         {
             int i4;
+            int i_decimate_8x8 = 0;
             DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
             h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
             for( i4 = 0; i4 < 4; i4++ )
-                x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
-
-            for( i4 = 0; i4 < 4; i4++ )
-                h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
-
-            if( b_decimate )
             {
-                int i_decimate_8x8 = 0;
-                for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
-                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
-                nnz8x8 = 4 <= i_decimate_8x8;
+                nz = x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
+                h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz;
+                if( nz )
+                {
+                    h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
+                    h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
+                    if( b_decimate )
+                        i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
+                    nnz8x8 = 1;
+                }
             }
-            else
-                nnz8x8 = array_non_zero( dct4x4 );
+
+            if( b_decimate && i_decimate_8x8 < 4 )
+                nnz8x8 = 0;
 
             if( nnz8x8 )
-            {
-                for( i4 = 0; i4 < 4; i4++ )
-                {
-                    if( array_non_zero( dct4x4[i4] ) )
-                    {
-                        h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
-                        h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = 1;
-                    }
-                    else
-                        h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = 0;
-                }
                 h->dctf.add8x8_idct( p_fdec, dct4x4 );
-            }
             else
-            {
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
-            }
+                STORE_8x8_NNZ(i8,0);
         }
 
         i_qp = h->mb.i_chroma_qp;
@@ -983,19 +1009,17 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             dct4x4[0][0] = 0;
 
             if( h->mb.b_trellis )
-                x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 );
+                nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 );
             else
-                h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
+                nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
 
-            if( array_non_zero( dct4x4 ) )
+            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
+            if( nz )
             {
                 h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
                 h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
                 h->dctf.add4x4_idct( p_fdec, dct4x4 );
-                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 1;
             }
-            else
-                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 0;
         }
     }
     h->mb.i_cbp_luma &= ~(1 << i8);
@@ -1014,6 +1038,7 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
     const int i_ref = h->mb.cache.ref[0][x264_scan8[i4]];
     const int mvx   = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][0], h->mb.mv_min[0], h->mb.mv_max[0] );
     const int mvy   = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    int nz;
 
     h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 );
 
@@ -1026,15 +1051,13 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
     {
         DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
         h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-        x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
-        if( array_non_zero( dct4x4 ) )
+        nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
+        h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
+        if( nz )
         {
             h->zigzagf.scan_4x4( h->dct.luma4x4[i4], dct4x4 );
             h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp );
             h->dctf.add4x4_idct( p_fdec, dct4x4 );
-            h->mb.cache.non_zero_count[x264_scan8[i4]] = 1;
         }
-        else
-            h->mb.cache.non_zero_count[x264_scan8[i4]] = 0;
     }
 }
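
The chroma loop and this 4x4 path now share one shape: quantize, record the returned flag straight into the nnz cache, and run zigzag/dequant/iDCT only when the flag is set. Purely for illustration (x264 keeps these steps inline; this helper is hypothetical), the pattern is:

    static int encode_4x4_block( x264_t *h, int16_t dct[4][4], uint8_t *p_fdec,
                                 int i_qp, int idx )
    {
        int nz = x264_quant_4x4( h, dct, i_qp, DCT_LUMA_4x4, 0, idx );
        h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;  /* nnz comes from quant */
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct );              /* for entropy coding */
            h->quantf.dequant_4x4( dct, h->dequant4_mf[CQM_4PY], i_qp );  /* for reconstruction */
            h->dctf.add4x4_idct( p_fdec, dct );
        }
        return nz;  /* all-zero blocks skip dequant/zigzag/idct entirely */
    }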
index 4cc599aa37cba017f66b4326923ab7af92bfe2f4..7b9f08a3ad21c00d51917d5763be9d1b459a9f7a 100644 (file)
@@ -55,11 +55,11 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
 
 void x264_cabac_mb_skip( x264_t *h, int b_skip );
 
-void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                              int i_qp, int i_ctxBlockCat, int b_intra );
-void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
+int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
                              int i_qp, int i_ctxBlockCat, int b_intra, int idx );
-void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
                              int i_qp, int b_intra, int idx );
 
 void x264_noise_reduction_update( x264_t *h );
index 1ba2a715220ca4eeceb86d0903246310c6251765..76cfdcaf0cd6aa7239699e9a3636878de84d9a57 100644 (file)
@@ -215,6 +215,8 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
     if( i_pixel > PIXEL_8x8 )
         return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel );
 
+    h->mb.i_cbp_luma = 0;
+
     x264_macroblock_encode_p8x8( h, i8 );
     if( i_pixel == PIXEL_16x8 )
         x264_macroblock_encode_p8x8( h, i8+1 );
@@ -243,6 +245,8 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
 static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
 {
     uint64_t i_ssd, i_bits;
+    h->mb.i_cbp_luma = 0;
+    h->mb.b_transform_8x8 = 1;
 
     x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
     i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
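
Both resets exist because nnz and the CBP are now maintained incrementally during encoding rather than recomputed in a final pass: an RD re-encode of a single partition only touches its own 8x8 blocks, so the luma CBP has to start from a known state. Inside the block encoders the bookkeeping then reduces to roughly this (the clear appears verbatim at the end of x264_macroblock_encode_p8x8 above; the set is inferred):

    h->mb.i_cbp_luma &= ~(1 << i8);   /* forget this 8x8's previous state */
    if( nnz8x8 )
        h->mb.i_cbp_luma |= 1 << i8;  /* mark it coded only if coefficients survived */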
@@ -404,7 +408,7 @@ typedef struct {
 // comparable to the input. so unquant is the direct inverse of quant,
 // and uses the dct scaling factors, not the idct ones.
 
-static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct,
+static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
                                  const uint16_t *quant_mf, const int *unquant_mf,
                                  const int *coef_weight, const uint8_t *zigzag,
                                  int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
@@ -419,7 +423,7 @@ static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct,
     const int b_interlaced = h->mb.b_interlaced;
     const int f = 1 << 15; // no deadzone
     int i_last_nnz;
-    int i, j;
+    int i, j, nz;
 
     // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
     // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
@@ -438,7 +442,7 @@ static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct,
     if( i < b_ac )
     {
         memset( dct, 0, i_coefs * sizeof(*dct) );
-        return;
+        return 0;
     }
 
     i_last_nnz = i;
@@ -613,39 +617,42 @@ static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct,
             bnode = &nodes_cur[j];
 
     j = bnode->level_idx;
+    nz = 0;
     for( i = b_ac; i < i_coefs; i++ )
     {
         dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
+        nz |= level_tree[j].abs_level;
         j = level_tree[j].next;
     }
+    return !!nz;
 }
 
 const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
 
-void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                             int i_qp, int i_ctxBlockCat, int b_intra )
 {
-    quant_trellis_cabac( h, (int16_t*)dct,
+    return quant_trellis_cabac( h, (int16_t*)dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
         i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
 }
 
-void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
+int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
                              int i_qp, int i_ctxBlockCat, int b_intra, int idx )
 {
     int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
-    quant_trellis_cabac( h, (int16_t*)dct,
+    return quant_trellis_cabac( h, (int16_t*)dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         x264_dct4_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan4[h->mb.b_interlaced],
         i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx );
 }
 
-void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
                              int i_qp, int b_intra, int idx )
 {
-    quant_trellis_cabac( h, (int16_t*)dct,
+    return quant_trellis_cabac( h, (int16_t*)dct,
         h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
         x264_dct8_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan8[h->mb.b_interlaced],
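
With the trellis entry points returning the same 0/1 flag as h->quantf.quant_*, the quantization wrappers in encoder/macroblock.c can simply forward whichever quantizer runs. Presumably the 4x4 wrapper looks along these lines (a sketch mirroring the trellis/plain split visible at the chroma call site above):

    static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp,
                                             int i_ctxBlockCat, int b_intra, int idx )
    {
        int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
        if( h->mb.b_trellis )
            return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp,
                                           i_ctxBlockCat, b_intra, idx );
        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp],
                                         h->quant4_bias[i_quant_cat][i_qp] );
    }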
index 9bc802ad7943a889aa2e972a472c04a050fdfec1..3f89e6815212fc0fc20953529797294a562024ed 100644 (file)
@@ -558,6 +558,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     TEST_IDCT( add8x8_idct, dct4 );
     TEST_IDCT( add8x8_idct_dc, dct4 );
     TEST_IDCT( add16x16_idct, dct4 );
+    TEST_IDCT( add16x16_idct_dc, dct4 );
     report( "add_idct4 :" );
 
     ok = 1; used_asm = 0;
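
add16x16_idct_dc, tested here for the first time, is the new DC-only iDCT: once decimation has stripped all AC coefficients from an I16x16 macroblock, reconstruction collapses to adding one rounded DC value per 4x4 block. A sketch of the per-block primitive, assuming the standard H.264 rounding of (dc+32)>>6:

    static void add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
    {
        int i, j;
        dc = (dc + 32) >> 6;                 /* final iDCT scaling, DC term only */
        for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
            for( j = 0; j < 4; j++ )
                p_dst[j] = x264_clip_uint8( p_dst[j] + dc );
    }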
@@ -958,7 +959,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     DECLARE_ALIGNED_16( uint8_t cqm_buf[64] );
     int ret = 0, ok, used_asm;
     int oks[2] = {1,1}, used_asms[2] = {0,0};
-    int i, i_cqm, qp;
+    int i, j, i_cqm, qp;
     x264_t h_buf;
     x264_t *h = &h_buf;
     memset( h, 0, sizeof(*h) );
@@ -1007,7 +1008,7 @@ static int check_quant( int cpu_ref, int cpu_new )
                 for( x = 0; x < 8; x++ ) \
                 { \
                     unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
-                    dct1[y*8+x] = dct2[y*8+x] = (rand()%(2*scale+1))-scale; \
+                    dct1[y*8+x] = dct2[y*8+x] = j ? (rand()%(2*scale+1))-scale : 0; \
                 } \
         }
 
@@ -1019,7 +1020,7 @@ static int check_quant( int cpu_ref, int cpu_new )
                 for( x = 0; x < 4; x++ ) \
                 { \
                     unsigned int scale = 255*scale1d[y]*scale1d[x]; \
-                    dct1[y*4+x] = dct2[y*4+x] = (rand()%(2*scale+1))-scale; \
+                    dct1[y*4+x] = dct2[y*4+x] = j ? (rand()%(2*scale+1))-scale : 0; \
                 } \
         }
 
@@ -1030,18 +1031,22 @@ static int check_quant( int cpu_ref, int cpu_new )
             used_asms[0] = 1; \
             for( qp = 51; qp > 0; qp-- ) \
             { \
-                for( i = 0; i < 16; i++ ) \
-                    dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
-                call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                if( memcmp( dct1, dct2, 16*2 ) )       \
+                for( j = 0; j < 2; j++ ) \
                 { \
-                    oks[0] = 0; \
-                    fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
-                    break; \
+                    int result_c, result_a; \
+                    for( i = 0; i < 16; i++ ) \
+                        dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
+                    result_c = call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                    result_a = call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                    if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a )       \
+                    { \
+                        oks[0] = 0; \
+                        fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
+                        break; \
+                    } \
+                    call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                    call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                 } \
-                call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
             } \
         }
 
@@ -1052,17 +1057,21 @@ static int check_quant( int cpu_ref, int cpu_new )
             used_asms[0] = 1; \
             for( qp = 51; qp > 0; qp-- ) \
             { \
-                INIT_QUANT##w() \
-                call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                if( memcmp( dct1, dct2, w*w*2 ) ) \
+                for( j = 0; j < 2; j++ ) \
                 { \
-                    oks[0] = 0; \
-                    fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
-                    break; \
+                    int result_c, result_a; \
+                    INIT_QUANT##w() \
+                    result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                    result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                    if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \
+                    { \
+                        oks[0] = 0; \
+                        fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
+                        break; \
+                    } \
+                    call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                    call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                 } \
-                call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
             } \
         }
 
@@ -1078,6 +1087,7 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
             used_asms[1] = 1; \
+            j = 1; \
             for( qp = 51; qp > 0; qp-- ) \
             { \
                 INIT_QUANT##w() \
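
The new j loop matters because the quant functions now return a flag: an all-zero block is exactly the case a SIMD implementation (such as the new SSE4 ptest path) is most likely to get wrong, so each pair is exercised on zero input (j=0) as well as random input (j=1), and a test fails on either a coefficient mismatch or a mismatched return value. The dequant tests reuse the INIT_QUANT macros, hence the pinned j = 1 above to keep their input random. Stripped of the macro plumbing, the check amounts to this (a sketch; mf/bias stand for the qp-indexed tables used in the real macros):

    for( j = 0; j < 2; j++ )
    {
        int result_c, result_a;
        for( i = 0; i < 16; i++ )
            dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0;
        result_c = qf_c.quant_4x4( dct1, mf, bias );   /* C reference */
        result_a = qf_a.quant_4x4( dct2, mf, bias );   /* asm under test */
        if( memcmp( dct1, dct2, 16 * sizeof(int16_t) ) || result_c != result_a )
            ok = 0;   /* coefficients and returned nz flag must both match */
    }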