quant_4x4x4: quant one 8x8 block at a time
author    Fiona Glaser <fiona@x264.com>  Fri, 8 Feb 2013 23:34:38 +0000 (15:34 -0800)
committer Fiona Glaser <fiona@x264.com>  Tue, 26 Feb 2013 07:22:56 +0000 (23:22 -0800)
This reduces overhead and lets us use less branchy code for zigzag, dequant,
decimate, and so on.
Reorganize and optimize a lot of macroblock_encode using this new function.
~1-2% faster overall.

Includes NEON and x86 versions of the new function.
Using larger merged functions like this will also make wider SIMD, like
AVX2, more effective.
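
For orientation: the new primitive quantizes four consecutive 4x4 blocks in one
call and returns a 4-bit mask with bit j set iff block j kept any nonzero
coefficient. A caller-side sketch, condensed from the encoder/macroblock.c
changes below (names as in the diff):

    int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PY][i_qp],
                                    h->quant4_bias[CQM_4PY][i_qp] );
    FOREACH_BIT( idx, i8x8*4, nz )  /* visits only the blocks with nonzero coeffs */
    {
        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
        h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
        h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
    }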

common/arm/quant-a.S
common/arm/quant.h
common/osdep.h
common/quant.c
common/quant.h
common/x86/quant-a.asm
common/x86/quant.h
encoder/macroblock.c
encoder/macroblock.h
encoder/rdo.c
tools/checkasm.c

common/arm/quant-a.S
index 0cc865768eff4344a06539cae7b355ddf9738d8b..a265fc7f5504ee8a5346e679b9801ec81a37a68e 100644
@@ -35,7 +35,7 @@ pmovmskb_byte:
 
 .text
 
-.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
+.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
     vadd.u16    q8,  q8,  \bias0
     vadd.u16    q9,  q9,  \bias1
 .ifc \load_mf, yes
@@ -55,7 +55,7 @@ pmovmskb_byte:
     veor        q9,  q9,  q15
     vsub.s16    q8,  q8,  q14
     vsub.s16    q9,  q9,  q15
-    vorr        \bias0, q8,  q9
+    vorr        \mask, q8,  q9
     vst1.64     {d16-d19}, [r0,:128]!
 .endm
 
@@ -89,7 +89,7 @@ function x264_quant_4x4_dc_neon
     vabs.s16    q9,  q15
     vdup.16     q0,  r2
     vdup.16     q2,  r1
-    QUANT_TWO   q0,  q0,  d4,  d5,  d4,  d5
+    QUANT_TWO   q0,  q0,  d4,  d5,  d4,  d5,  q0
     vorr        d0,  d0,  d1
     QUANT_END   d0
 .endfunc
@@ -101,11 +101,50 @@ function x264_quant_4x4_neon
     vabs.s16    q9,  q15
     vld1.64     {d0-d3}, [r2,:128]
     vld1.64     {d4-d7}, [r1,:128]
-    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7, q0
     vorr        d0,  d0,  d1
     QUANT_END   d0
 .endfunc
 
+// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4x4_neon
+    vld1.64     {d28-d31}, [r0,:128]
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vld1.64     {d0-d3},   [r2,:128]
+    vld1.64     {d4-d7},   [r1,:128]
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q4
+    vld1.64     {d28-d31}, [r0,:128]
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q5
+    vld1.64     {d28-d31}, [r0,:128]
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q6
+    vld1.64     {d28-d31}, [r0,:128]
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q7
+    vorr        d8,  d8,  d9
+    vorr       d10, d10, d11
+    vorr       d12, d12, d13
+    vorr       d14, d14, d15
+    vmov        r0,  r1,  d8
+    vmov        r2,  r3, d10
+    orrs        r0,  r1
+    movne       r0,  #1
+    orrs        r2,  r3
+    orrne       r0,  #2
+    vmov        r1,  r2, d12
+    vmov        r3,  ip, d14
+    orrs        r1,  r2
+    orrne       r0,  #4
+    orrs        r3,  ip
+    orrne       r0,  #8
+    bx          lr
+.endfunc
+
 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 function x264_quant_8x8_neon
     vld1.64     {d28-d31}, [r0,:128]
@@ -113,13 +152,13 @@ function x264_quant_8x8_neon
     vabs.s16    q9,  q15
     vld1.64     {d0-d3},   [r2,:128]!
     vld1.64     {d4-d7},   [r1,:128]!
-    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7,  q0
 .rept 3
     vld1.64     {d28-d31}, [r0,:128]
     vabs.s16    q8,  q14
     vabs.s16    q9,  q15
     vld1.64     {d2-d5},   [r2,:128]!
-    QUANT_TWO   q1,  q2,  d4,  d5,  d6,  d7, yes
+    QUANT_TWO   q1,  q2,  d4,  d5,  d6,  d7,  q1, yes
     vorr        q0,  q0,  q1
 .endr
     vorr        d0,  d0,  d1
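
The tail of x264_quant_4x4x4_neon above ORs each block's sign-restored output
into its own register (q4-q7 via the new mask argument), folds each to 64 bits,
and assembles the 4-bit result with orrs/orrne. A minimal C model of that
reduction (hypothetical helper, not part of the diff):

    #include <stdint.h>

    /* acc[j] is the bitwise OR of every output coefficient of block j;
     * block j contributes bit j iff any coefficient survived quantization. */
    static int nz_mask_from_accumulators( const uint64_t acc[4] )
    {
        int mask = 0;
        for( int j = 0; j < 4; j++ )
            mask |= (acc[j] != 0) << j;
        return mask;
    }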
common/arm/quant.h
index a9e084b7a78356982b86116e70099bf7369b3170..bf29f251b4f2b33ab011323f770c9937684cea17 100644
@@ -31,6 +31,7 @@ int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
 int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
 int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
 int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
 int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
 
 void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
common/osdep.h
index 27cf5e9a506c858f23e0a83da272ef322a71c78d..1e17c6a176c45d68cc7911d2e51b5dd57f5a681b 100644
@@ -254,6 +254,13 @@ static ALWAYS_INLINE uint16_t endian_fix16( uint16_t x )
 }
 #endif
 
+/* For values with 4 bits or less. */
+static int ALWAYS_INLINE x264_ctz_4bit( uint32_t x )
+{
+    static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
+    return lut[x];
+}
+
 #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3)
 #define x264_clz(x) __builtin_clz(x)
 #define x264_ctz(x) __builtin_ctz(x)
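
The LUT above encodes count-trailing-zeros for inputs 0..15, with lut[0] = 4 as
the "no bits set" sentinel; FOREACH_BIT (added in encoder/macroblock.h below)
relies on it to step between set bits without a branch. A quick sanity check
against the GCC builtin:

    #include <assert.h>

    /* e.g. x264_ctz_4bit(6) == 1 (6 = 0b110), x264_ctz_4bit(0) == 4 (sentinel) */
    for( uint32_t x = 1; x < 16; x++ )
        assert( x264_ctz_4bit( x ) == __builtin_ctz( x ) );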
common/quant.c
index 9d0b42577a98f8bdace8f4072b8aa53550bba866..f4822ac22422038750288aa8405344154ba856c9 100644
@@ -63,6 +63,19 @@ static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
     return !!nz;
 }
 
+static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] )
+{
+    int nza = 0;
+    for( int j = 0; j < 4; j++ )
+    {
+        int nz = 0;
+        for( int i = 0; i < 16; i++ )
+            QUANT_ONE( dct[j][i], mf[i], bias[i] );
+        nza |= (!!nz)<<j;
+    }
+    return nza;
+}
+
 static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
 {
     int nz = 0;
@@ -405,6 +418,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 {
     pf->quant_8x8 = quant_8x8;
     pf->quant_4x4 = quant_4x4;
+    pf->quant_4x4x4 = quant_4x4x4;
     pf->quant_4x4_dc = quant_4x4_dc;
     pf->quant_2x2_dc = quant_2x2_dc;
 
@@ -464,6 +478,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     if( cpu&X264_CPU_SSE2 )
     {
         pf->quant_4x4 = x264_quant_4x4_sse2;
+        pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
         pf->quant_8x8 = x264_quant_8x8_sse2;
         pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
         pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
@@ -501,6 +516,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     if( cpu&X264_CPU_SSSE3 )
     {
         pf->quant_4x4 = x264_quant_4x4_ssse3;
+        pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
         pf->quant_8x8 = x264_quant_8x8_ssse3;
         pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
         pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
@@ -520,6 +536,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
         pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
         pf->quant_4x4 = x264_quant_4x4_sse4;
+        pf->quant_4x4x4 = x264_quant_4x4x4_sse4;
         pf->quant_8x8 = x264_quant_8x8_sse4;
     }
     if( cpu&X264_CPU_AVX )
@@ -543,6 +560,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     {
 #if ARCH_X86
         pf->quant_4x4 = x264_quant_4x4_mmx;
+        pf->quant_4x4x4 = x264_quant_4x4x4_mmx;
         pf->quant_8x8 = x264_quant_8x8_mmx;
         pf->dequant_4x4 = x264_dequant_4x4_mmx;
         pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
@@ -592,6 +610,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     {
         pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
         pf->quant_4x4 = x264_quant_4x4_sse2;
+        pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
         pf->quant_8x8 = x264_quant_8x8_sse2;
         pf->dequant_4x4 = x264_dequant_4x4_sse2;
         pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
@@ -631,6 +650,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
         pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
         pf->quant_4x4 = x264_quant_4x4_ssse3;
+        pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
         pf->quant_8x8 = x264_quant_8x8_ssse3;
         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
         pf->denoise_dct = x264_denoise_dct_ssse3;
@@ -696,6 +716,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
         pf->quant_4x4      = x264_quant_4x4_neon;
         pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
+        pf->quant_4x4x4    = x264_quant_4x4x4_neon;
         pf->quant_8x8      = x264_quant_8x8_neon;
         pf->dequant_4x4    = x264_dequant_4x4_neon;
         pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
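
The C fallback quant_4x4x4 above reuses the file's pre-existing QUANT_ONE
kernel, unchanged by this commit; quoted here for context, it quantizes one
coefficient with the usual multiply-shift and ORs the result into nz:

    #define QUANT_ONE( coef, mf, f ) \
    { \
        if( (coef) > 0 ) \
            (coef) = (f + (coef)) * (mf) >> 16; \
        else \
            (coef) = - ((f - (coef)) * (mf) >> 16); \
        nz |= (coef); \
    }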
common/quant.h
index 7056b5b8428afa3f1c6b27e7f1e1de7f550d19cb..ee80aeb67822794c9b664ca3d681347c8f0f6ac4 100644
@@ -29,8 +29,9 @@
 
 typedef struct
 {
-    int (*quant_8x8)( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
-    int (*quant_4x4)( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
+    int (*quant_8x8)  ( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
+    int (*quant_4x4)  ( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
+    int (*quant_4x4x4)( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
     int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias );
     int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias );
 
common/x86/quant-a.asm
index c1887d1e4366c7d18a27226506adb4a44ea01014..56e2f80d294b8906b2f3a50b231265a605c0bd39 100644
@@ -175,7 +175,7 @@ cextern pd_1024
 %endif ; cpuflag
 %endmacro
 
-%macro QUANT_ONE_AC_MMX 4
+%macro QUANT_ONE_AC_MMX 5
     mova        m0, [%1]
     mova        m2, [%2]
     ABSD        m1, m0
@@ -191,10 +191,10 @@ cextern pd_1024
     psrad       m1, 16
     PSIGND      m1, m0
     mova      [%1], m1
-    ACCUM      por, 5, 1, %4
+    ACCUM      por, %5, 1, %4
 %endmacro
 
-%macro QUANT_TWO_AC 4
+%macro QUANT_TWO_AC 5
 %if cpuflag(sse4)
     mova        m0, [%1       ]
     mova        m1, [%1+mmsize]
@@ -210,11 +210,11 @@ cextern pd_1024
     PSIGND      m3, m1
     mova [%1       ], m2
     mova [%1+mmsize], m3
-    ACCUM      por, 5, 2, %4
-    por         m5, m3
+    ACCUM      por, %5, 2, %4
+    ACCUM      por, %5, 3, %4+mmsize
 %else ; !sse4
-    QUANT_ONE_AC_MMX %1, %2, %3, %4
-    QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
+    QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
+    QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize, %5
 %endif ; cpuflag
 %endmacro
 
@@ -244,30 +244,58 @@ cglobal quant_%1x%2_dc, 3,3,8
 cglobal quant_%1x%2, 3,3,8
 %assign x 0
 %rep %1*%2/(mmsize/2)
-    QUANT_TWO_AC r0+x, r1+x, r2+x, x
+    QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5
 %assign x x+mmsize*2
 %endrep
     QUANT_END
     RET
 %endmacro
 
+%macro QUANT_4x4 2
+    QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, mmsize*0, %2
+    QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, mmsize*2, %2
+%endmacro
+
+%macro QUANT_4x4x4 0
+cglobal quant_4x4x4, 3,3,8
+    QUANT_4x4  0, 5
+    QUANT_4x4 64, 6
+    add       r0, 128
+    packssdw  m5, m6
+    QUANT_4x4  0, 6
+    QUANT_4x4 64, 7
+    packssdw  m6, m7
+    packssdw  m5, m6
+    packssdw  m5, m5  ; AA BB CC DD
+    packsswb  m5, m5  ; A B C D
+    pxor      m4, m4
+    pcmpeqb   m5, m4
+    pmovmskb eax, m5
+    not      eax
+    and      eax, 0xf
+    RET
+%endmacro
+
 INIT_XMM sse2
 QUANT_DC 2, 2
 QUANT_DC 4, 4
 QUANT_AC 4, 4
 QUANT_AC 8, 8
+QUANT_4x4x4
 
 INIT_XMM ssse3
 QUANT_DC 2, 2
 QUANT_DC 4, 4
 QUANT_AC 4, 4
 QUANT_AC 8, 8
+QUANT_4x4x4
 
 INIT_XMM sse4
 QUANT_DC 2, 2
 QUANT_DC 4, 4
 QUANT_AC 4, 4
 QUANT_AC 8, 8
+QUANT_4x4x4
 
 %endif ; HIGH_BIT_DEPTH
 
@@ -285,7 +313,7 @@ QUANT_AC 8, 8
     ACCUM     por, 5, 0, %4
 %endmacro
 
-%macro QUANT_TWO 7
+%macro QUANT_TWO 8
     mova       m1, %1
     mova       m3, %2
     ABSW       m0, m1, sign
@@ -298,8 +326,8 @@ QUANT_AC 8, 8
     PSIGNW     m2, m3
     mova       %1, m0
     mova       %2, m2
-    ACCUM     por, 5, 0, %7
-    por        m5, m2
+    ACCUM     por, %8, 0, %7
+    ACCUM     por, %8, 2, %7+mmsize
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -313,7 +341,7 @@ cglobal %1, 1,1,%3
 %else
 %assign x 0
 %rep %2/2
-    QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
+    QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x, 5
 %assign x x+mmsize*2
 %endrep
 %endif
@@ -328,13 +356,51 @@ cglobal %1, 1,1,%3
 cglobal %1, 3,3
 %assign x 0
 %rep %2/2
-    QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
+    QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
 %assign x x+mmsize*2
 %endrep
     QUANT_END
     RET
 %endmacro
 
+%macro QUANT_4x4 2
+%if UNIX64
+    QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2
+%else
+    QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2
+%if mmsize==8
+    QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2
+%endif
+%endif
+%endmacro
+
+%macro QUANT_4x4x4 0
+cglobal quant_4x4x4, 3,3,7
+%if UNIX64
+    mova      m8, [r1+mmsize*0]
+    mova      m9, [r1+mmsize*1]
+    mova     m10, [r2+mmsize*0]
+    mova     m11, [r2+mmsize*1]
+%endif
+    QUANT_4x4  0, 4
+    QUANT_4x4 32, 5
+    packssdw  m4, m5
+    QUANT_4x4 64, 5
+    QUANT_4x4 96, 6
+    packssdw  m5, m6
+    packssdw  m4, m5
+%if mmsize == 16
+    packssdw  m4, m4  ; AA BB CC DD
+%endif
+    packsswb  m4, m4  ; A B C D
+    pxor      m3, m3
+    pcmpeqb   m4, m3
+    pmovmskb eax, m4
+    not      eax
+    and      eax, 0xf
+    RET
+%endmacro
+
 INIT_MMX mmx2
 QUANT_DC quant_2x2_dc, 1
 %if ARCH_X86_64 == 0 ; not needed because sse2 is faster
@@ -342,17 +408,20 @@ QUANT_DC quant_4x4_dc, 4
 INIT_MMX mmx
 QUANT_AC quant_4x4, 4
 QUANT_AC quant_8x8, 16
+QUANT_4x4x4
 %endif
 
 INIT_XMM sse2
 QUANT_DC quant_4x4_dc, 2, 8
 QUANT_AC quant_4x4, 2
 QUANT_AC quant_8x8, 8
+QUANT_4x4x4
 
 INIT_XMM ssse3
 QUANT_DC quant_4x4_dc, 2, 8
 QUANT_AC quant_4x4, 2
 QUANT_AC quant_8x8, 8
+QUANT_4x4x4
 
 INIT_MMX ssse3
 QUANT_DC quant_2x2_dc, 1
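
The low-bit-depth quant_4x4x4 above extracts its return value with a chain of
saturating packs: signed saturation never maps a nonzero lane to zero, so each
block's OR-accumulator collapses to a single byte, and PCMPEQB/PMOVMSKB turn
the four bytes into a 4-bit mask. A C model with SSE2 intrinsics (hypothetical
helper, for illustration only; the asm interleaves these packs with the
quantization rounds):

    #include <emmintrin.h>

    /* a..d each hold the OR of one block's quantized coefficients,
     * as eight 16-bit lanes per register. */
    static int quant_4x4x4_mask( __m128i a, __m128i b, __m128i c, __m128i d )
    {
        __m128i ab   = _mm_packs_epi32( a, b );   /* packssdw: nonzero stays nonzero */
        __m128i cd   = _mm_packs_epi32( c, d );
        __m128i abcd = _mm_packs_epi32( ab, cd );
        abcd = _mm_packs_epi32( abcd, abcd );     /* 16-bit lanes: A B C D A B C D */
        abcd = _mm_packs_epi16( abcd, abcd );     /*  8-bit lanes: A B C D ...     */
        int zmask = _mm_movemask_epi8( _mm_cmpeq_epi8( abcd, _mm_setzero_si128() ) );
        return ~zmask & 0xf;                      /* bit j set iff block j is nonzero */
    }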
common/x86/quant.h
index e22b48586e366a48f403e920c8a7f6348f4e27b1..15b9aabd0d5f52ac293107680c27394a8431fc21 100644
 int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
 int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
+int x264_quant_4x4x4_mmx( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
 int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
+int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
 int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias );
 int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
+int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
 int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 int x264_quant_2x2_dc_sse4( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias );
 int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
+int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
 int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
 void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
encoder/macroblock.c
index 1a9c7781b38249f968d906dd9633baf00532d659..27374328f19ff4b7ec721c3a5b3e7907ff8917fa 100644
@@ -157,28 +157,51 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
         return;
     }
 
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0+p*16]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2+p*16]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8+p*16]] ) = 0;
+    M32( &h->mb.cache.non_zero_count[x264_scan8[10+p*16]] ) = 0;
+
     h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
 
-    for( int i = 0; i < 16; i++ )
+    if( h->mb.b_noise_reduction )
+        for( int idx = 0; idx < 16; idx++ )
+            h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
+
+    for( int idx = 0; idx < 16; idx++ )
     {
-        /* copy dc coeff */
-        if( h->mb.b_noise_reduction )
-            h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 );
-        dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
-        dct4x4[i][0] = 0;
+        dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0];
+        dct4x4[idx][0] = 0;
+    }
 
-        /* quant/scan/dequant */
-        if( h->mb.b_trellis )
-            nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i );
-        else
-            nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
-        h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
-        if( nz )
+    if( h->mb.b_trellis )
+    {
+        for( int idx = 0; idx < 16; idx++ )
+            if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) )
+            {
+                block_cbp = 0xf;
+                h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
+                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
+                if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
+                h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
+            }
+    }
+    else
+    {
+        for( int i8x8 = 0; i8x8 < 4; i8x8++ )
         {
-            h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] );
-            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp );
-            if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] );
-            block_cbp = 0xf;
+            nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
+            if( nz )
+            {
+                block_cbp = 0xf;
+                FOREACH_BIT( idx, i8x8*4, nz )
+                {
+                    h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
+                    h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
+                    if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
+                    h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
+                }
+            }
         }
     }
 
@@ -245,6 +268,18 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
     h->mb.i_cbp_chroma = 0;
     h->nr_count[2] += h->mb.b_noise_reduction * 4;
 
+    M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
+    M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
+    M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
+    M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
+    if( chroma422 )
+    {
+        M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
+        M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
+        M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
+        M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
+    }
+
     /* Early termination: check variance of chroma residual before encoding.
      * Don't bother trying early termination at low QPs.
      * Values are experimentally derived. */
@@ -259,17 +294,6 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
             score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
         if( score < thresh*4 )
         {
-            M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
-            M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
-            M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
-            M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
-            if( chroma422 )
-            {
-                M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
-                M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
-                M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
-                M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
-            }
             h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
             h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
 
@@ -326,7 +350,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
     {
         pixel *p_src = h->mb.pic.p_fenc[1+ch];
         pixel *p_dst = h->mb.pic.p_fdec[1+ch];
-        int i_decimate_score = 0;
+        int i_decimate_score = b_decimate ? 0 : 7;
         int nz_ac = 0;
 
         ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
@@ -361,20 +385,40 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
             dct2x2dc( dct_dc, dct4x4 );
 
         /* calculate dct coeffs */
-        for( int i = 0; i < (chroma422?8:4); i++ )
+        for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ )
         {
             if( h->mb.b_trellis )
-                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
+            {
+                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                {
+                    if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) )
+                    {
+                        int idx = 16+ch*16+i8x8*8+i4x4;
+                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
+                        h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
+                        if( i_decimate_score < 7 )
+                            i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
+                        h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
+                        nz_ac = 1;
+                    }
+                }
+            }
             else
-                nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
-            h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
-            if( nz )
             {
-                nz_ac = 1;
-                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] );
-                h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp );
-                if( b_decimate )
-                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] );
+                nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp],
+                                            h->quant4_bias[CQM_4IC+b_inter][i_qp] );
+                nz_ac |= nz;
+
+                FOREACH_BIT( i4x4, 0, nz )
+                {
+                    int idx = 16+ch*16+i8x8*8+i4x4;
+
+                    h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
+                    h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
+                    if( i_decimate_score < 7 )
+                        i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
+                    h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
+                }
             }
         }
 
@@ -390,7 +434,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
 
         h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;
 
-        if( (b_decimate && i_decimate_score < 7) || !nz_ac )
+        if( i_decimate_score < 7 || !nz_ac )
         {
             /* Decimate the block */
             M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
@@ -646,11 +690,8 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
     {
         h->mb.b_transform_8x8 = 0;
 
-        for( int p = 0; p < plane_count; p++ )
-        {
+        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
             x264_mb_encode_i16x16( h, p, i_qp );
-            i_qp = h->mb.i_chroma_qp;
-        }
     }
     else if( h->mb.i_type == I_8x8 )
     {
@@ -668,14 +709,13 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
             if( h->mb.i_skip_intra == 2 )
                 h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
         }
-        for( int p = 0; p < plane_count; p++ )
+        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
         {
             for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ )
             {
                 int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
                 x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
             }
-            i_qp = h->mb.i_chroma_qp;
         }
     }
     else if( h->mb.i_type == I_4x4 )
@@ -694,7 +734,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
             if( h->mb.i_skip_intra == 2 )
                 h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
         }
-        for( int p = 0; p < plane_count; p++ )
+        for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
         {
             for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ )
             {
@@ -707,7 +747,6 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
 
                 x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
             }
-            i_qp = h->mb.i_chroma_qp;
         }
     }
     else    /* Inter MB */
@@ -747,8 +786,9 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
             ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
             b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
 
-            for( int p = 0; p < plane_count; p++ )
+            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
             {
+                CLEAR_16x16_NNZ( p );
                 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
                 h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;
 
@@ -772,99 +812,92 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
                     }
                 }
 
-                if( i_decimate_mb < 6 && b_decimate )
-                {
-                    plane_cbp = 0;
-                    CLEAR_16x16_NNZ( p );
-                }
-                else
+                if( i_decimate_mb >= 6 || !b_decimate )
                 {
-                    for( int idx = 0; idx < 4; idx++ )
+                    h->mb.i_cbp_luma |= plane_cbp;
+                    FOREACH_BIT( idx, 0, plane_cbp )
                     {
-                        int x = idx&1;
-                        int y = idx>>1;
-
-                        if( plane_cbp&(1<<idx) )
-                        {
-                            h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
-                            h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
-                            STORE_8x8_NNZ( p, idx, 1 );
-                        }
-                        else
-                            STORE_8x8_NNZ( p, idx, 0 );
+                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
+                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] );
+                        STORE_8x8_NNZ( p, idx, 1 );
                     }
                 }
-                h->mb.i_cbp_luma |= plane_cbp;
-                i_qp = h->mb.i_chroma_qp;
             }
         }
         else
         {
             ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
-            for( int p = 0; p < plane_count; p++ )
+            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
             {
+                CLEAR_16x16_NNZ( p );
                 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
-                h->nr_count[0+!!p*2] += h->mb.b_noise_reduction * 16;
+
+                if( h->mb.b_noise_reduction )
+                {
+                    h->nr_count[0+!!p*2] += 16;
+                    for( int idx = 0; idx < 16; idx++ )
+                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
+                }
 
                 int plane_cbp = 0;
                 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                 {
-                    int i_decimate_8x8 = 0;
-                    int cbp = 0;
-
-                    /* encode one 4x4 block */
-                    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                    int i_decimate_8x8 = b_decimate ? 0 : 6;
+                    int nnz8x8 = 0;
+                    if( h->mb.b_trellis )
                     {
-                        int idx = i8x8 * 4 + i4x4;
-
-                        nz = x264_quant_4x4( h, dct4x4[idx], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, idx );
-                        h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
-
+                        for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                        {
+                            int idx = i8x8*4+i4x4;
+                            if( x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) )
+                            {
+                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
+                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
+                                if( i_decimate_8x8 < 6 )
+                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
+                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
+                                nnz8x8 = 1;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
                         if( nz )
                         {
-                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
-                            h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
-                            if( b_decimate && i_decimate_8x8 < 6 )
-                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
-                            cbp = 1;
+                            FOREACH_BIT( idx, i8x8*4, nz )
+                            {
+                                h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
+                                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
+                                if( i_decimate_8x8 < 6 )
+                                    i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
+                                h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
+                            }
                         }
                     }
-
-                    int x = i8x8&1;
-                    int y = i8x8>>1;
-
-                    /* decimate this 8x8 block */
-                    i_decimate_mb += i_decimate_8x8;
-                    if( b_decimate )
+                    if( nnz8x8 )
                     {
+                        i_decimate_mb += i_decimate_8x8;
                         if( i_decimate_8x8 < 4 )
                             STORE_8x8_NNZ( p, i8x8, 0 );
                         else
                             plane_cbp |= 1<<i8x8;
                     }
-                    else if( cbp )
-                    {
-                        h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
-                        plane_cbp |= 1<<i8x8;
-                    }
                 }
 
-                if( b_decimate )
+                if( i_decimate_mb < 6 )
                 {
-                    if( i_decimate_mb < 6 )
-                    {
-                        plane_cbp = 0;
-                        CLEAR_16x16_NNZ( p );
-                    }
-                    else
+                    plane_cbp = 0;
+                    CLEAR_16x16_NNZ( p );
+                }
+                else
+                {
+                    h->mb.i_cbp_luma |= plane_cbp;
+                    FOREACH_BIT( i8x8, 0, plane_cbp )
                     {
-                        for( int i8x8 = 0; i8x8 < 4; i8x8++ )
-                            if( plane_cbp&(1<<i8x8) )
-                                h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
+                        h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                     }
                 }
-                h->mb.i_cbp_luma |= plane_cbp;
-                i_qp = h->mb.i_chroma_qp;
             }
         }
     }
@@ -938,7 +971,7 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
     ALIGNED_4( int16_t mvp[2] );
     int i_qp = h->mb.i_qp;
 
-    for( int p = 0; p < plane_count; p++ )
+    for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
     {
         int quant_cat = p ? CQM_4PC : CQM_4PY;
         if( !b_bidir )
@@ -957,23 +990,23 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
         {
             int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
             int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
-            /* get luma diff */
+
             h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
                                         h->mb.pic.p_fdec[p] + fdec_offset );
-            /* encode one 4x4 block */
-            for( int i4x4 = 0; i4x4 < 4; i4x4++ )
-            {
-                if( h->mb.b_noise_reduction )
+
+            if( h->mb.b_noise_reduction )
+                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                     h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
-                if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ) )
-                    continue;
-                h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
+
+            int nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
+            FOREACH_BIT( idx, 0, nz )
+            {
+                h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
                 i_decimate_mb += h->quantf.decimate_score16( dctscan );
                 if( i_decimate_mb >= 6 )
                     return 0;
             }
         }
-        i_qp = h->mb.i_chroma_qp;
     }
 
     if( chroma == CHROMA_420 || chroma == CHROMA_422 )
@@ -1023,6 +1056,7 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
                 {
                     h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                     dct_dc[i4x4] = dct4x4[i4x4][0];
+                    dct4x4[i4x4][0] = 0;
                 }
             }
             else
@@ -1043,21 +1077,26 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
                 continue;
 
             if( !h->mb.b_noise_reduction )
-               for( int i = 0; i <= chroma422; i++ )
+                for( int i = 0; i <= chroma422; i++ )
+                {
                     h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
+                    dct4x4[i*4+0][0] = 0;
+                    dct4x4[i*4+1][0] = 0;
+                    dct4x4[i*4+2][0] = 0;
+                    dct4x4[i*4+3][0] = 0;
+                }
 
             /* calculate dct coeffs */
-            for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ )
+            for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < (chroma422?2:1); i8x8++ )
             {
-                dct4x4[i4x4][0] = 0;
-                if( h->mb.b_noise_reduction )
-                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-                if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
-                    continue;
-                h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
-                i_decimate_mb += h->quantf.decimate_score15( dctscan );
-                if( i_decimate_mb >= 7 )
-                    return 0;
+                int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
+                FOREACH_BIT( idx, i8x8*4, nz )
+                {
+                    h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
+                    i_decimate_mb += h->quantf.decimate_score15( dctscan );
+                    if( i_decimate_mb >= 7 )
+                        return 0;
+                }
             }
         }
     }
@@ -1176,12 +1215,13 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
     {
         if( h->mb.b_transform_8x8 )
         {
-            for( int p = 0; p < plane_count; p++ )
+            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
             {
                 int quant_cat = p ? CQM_8PC : CQM_8PY;
                 pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                 pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                 ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
+
                 h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
                 int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
                 if( nnz8x8 )
@@ -1196,50 +1236,74 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
                         h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
                         h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                         STORE_8x8_NNZ( p, i8, 1 );
+                        h->mb.i_cbp_luma |= 1 << i8;
                     }
                     else
                         STORE_8x8_NNZ( p, i8, 0 );
                 }
                 else
                     STORE_8x8_NNZ( p, i8, 0 );
-                h->mb.i_cbp_luma |= nnz8x8 << i8;
-                i_qp = h->mb.i_chroma_qp;
             }
         }
         else
         {
-            for( int p = 0; p < plane_count; p++ )
+            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
             {
                 int quant_cat = p ? CQM_4PC : CQM_4PY;
                 pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                 pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
-                int i_decimate_8x8 = 0, nnz8x8 = 0;
+                int i_decimate_8x8 = b_decimate ? 0 : 4;
                 ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
+                int nnz8x8 = 0;
+
                 h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
-                for( int i4 = 0; i4 < 4; i4++ )
+                STORE_8x8_NNZ( p, i8, 0 );
+
+                if( h->mb.b_noise_reduction )
+                    for( int idx = 0; idx < 4; idx++ )
+                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
+
+                if( h->mb.b_trellis )
                 {
-                    nz = x264_quant_4x4( h, dct4x4[i4], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i8*4+i4 );
-                    h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4]] = nz;
+                    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                    {
+                        if( x264_quant_4x4_trellis( h, dct4x4[i4x4], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, i8*4+i4x4+p*16 ) )
+                        {
+                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
+                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
+                            if( i_decimate_8x8 < 4 )
+                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
+                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
+                            nnz8x8 = 1;
+                        }
+                    }
+                }
+                else
+                {
+                    nnz8x8 = nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
                     if( nz )
                     {
-                        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4], dct4x4[i4] );
-                        h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[quant_cat], i_qp );
-                        if( b_decimate )
-                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4] );
-                        nnz8x8 = 1;
+                        FOREACH_BIT( i4x4, 0, nz )
+                        {
+                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
+                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
+                            if( i_decimate_8x8 < 4 )
+                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
+                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
+                        }
                     }
                 }
-
-                if( b_decimate && i_decimate_8x8 < 4 )
-                    nnz8x8 = 0;
-
                 if( nnz8x8 )
-                    h->dctf.add8x8_idct( p_fdec, dct4x4 );
-                else
-                    STORE_8x8_NNZ( p, i8, 0 );
-
-                h->mb.i_cbp_luma |= nnz8x8 << i8;
-                i_qp = h->mb.i_chroma_qp;
+                {
+                    /* decimate this 8x8 block */
+                    if( i_decimate_8x8 < 4 )
+                        STORE_8x8_NNZ( p, i8, 0 );
+                    else
+                    {
+                        h->dctf.add8x8_idct( p_fdec, dct4x4 );
+                        h->mb.i_cbp_luma |= 1 << i8;
+                    }
+                }
             }
         }
 
@@ -1297,7 +1361,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i
 {
     int i_qp = h->mb.i_qp;
 
-    for( int p = 0; p < plane_count; p++ )
+    for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
     {
         int quant_cat = p ? CQM_4PC : CQM_4PY;
         pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]];
@@ -1324,7 +1388,6 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i
                 h->dctf.add4x4_idct( p_fdec, dct4x4 );
             }
         }
-        i_qp = h->mb.i_chroma_qp;
     }
 }
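
One recurring pattern in the macroblock.c changes above: when decimation is
disabled, the decimate score now starts at its threshold (b_decimate ? 0 : 7
for chroma, 0 : 6 or 0 : 4 for luma), so the "score < threshold" guards in the
hot loops stay false and the separate b_decimate branches disappear. A minimal
model of the idiom (hypothetical helper):

    /* With b_decimate == 0 the score starts at the threshold, so neither
     * the accumulation guard nor the final decimation test ever fires. */
    static int should_decimate( int b_decimate, const int *scores, int n, int thresh )
    {
        int score = b_decimate ? 0 : thresh;
        for( int i = 0; i < n; i++ )
            if( score < thresh )
                score += scores[i];
        return score < thresh;
    }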
 
encoder/macroblock.h
index fda609d8148b6f8b452bbd4e72d819b26db2cb77..f0342cf33c68b6203c624000e93293193827e943 100644
@@ -104,6 +104,10 @@ do\
     M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\
 } while(0)
 
+/* A special for loop that iterates branchlessly over each set
+ * bit in a 4-bit input. */
+#define FOREACH_BIT(idx,start,mask) for( int idx = start, msk = mask, skip; msk && (skip = x264_ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ )
+
 static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict )
 {
     int nz;
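
FOREACH_BIT above uses x264_ctz_4bit (added in common/osdep.h) to jump straight
from one set bit to the next. A worked example (hypothetical snippet): with
mask 0xa = 0b1010 the body runs exactly for idx = 1 and idx = 3:

    int visited[4] = {0};
    FOREACH_BIT( idx, 0, 0xa )
        visited[idx] = 1;
    /* visited is now {0,1,0,1} */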
encoder/rdo.c
index 6c8c2c5ed6ea39e1398e8458e4a6362c284618e5..f847d478da8b912c732f5f23da556fb8b46550ab 100644
@@ -1161,5 +1161,6 @@ int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
         h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz;
         nzaccum |= nz;
     }
+    STORE_8x8_NNZ( 0, idx, 0 );
     return nzaccum;
 }
tools/checkasm.c
index ad22dec316a0ef2ee9a2fe1dda4a2fc5cc39c051..2f50967d2b5bcf87233bc9718ca44932e848a351 100644
@@ -1778,23 +1778,23 @@ static int check_quant( int cpu_ref, int cpu_new )
         x264_quant_init( h, cpu_ref, &qf_ref );
         x264_quant_init( h, cpu_new, &qf_a );
 
-#define INIT_QUANT8(j) \
+#define INIT_QUANT8(j,max) \
         { \
             static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
-            for( int i = 0; i < 64; i++ ) \
+            for( int i = 0; i < max; i++ ) \
             { \
-                unsigned int scale = (255*scale1d[i>>3]*scale1d[i&7])/16; \
-                dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
+                unsigned int scale = (255*scale1d[(i>>3)&7]*scale1d[i&7])/16; \
+                dct1[i] = dct2[i] = (j>>(i>>6))&1 ? (rand()%(2*scale+1))-scale : 0; \
             } \
         }
 
-#define INIT_QUANT4(j) \
+#define INIT_QUANT4(j,max) \
         { \
             static const int scale1d[4] = {4,6,4,6}; \
-            for( int i = 0; i < 16; i++ ) \
+            for( int i = 0; i < max; i++ ) \
             { \
-                unsigned int scale = 255*scale1d[i>>2]*scale1d[i&3]; \
-                dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
+                unsigned int scale = 255*scale1d[(i>>2)&3]*scale1d[i&3]; \
+                dct1[i] = dct2[i] = (j>>(i>>4))&1 ? (rand()%(2*scale+1))-scale : 0; \
             } \
         }
 
@@ -1824,34 +1824,36 @@ static int check_quant( int cpu_ref, int cpu_new )
             } \
         }
 
-#define TEST_QUANT( qname, block, w ) \
+#define TEST_QUANT( qname, block, type, w, maxj ) \
         if( qf_a.qname != qf_ref.qname ) \
         { \
             set_func_name( #qname ); \
             used_asms[0] = 1; \
             for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
             { \
-                for( int j = 0; j < 2; j++ ) \
+                for( int j = 0; j < maxj; j++ ) \
                 { \
-                    INIT_QUANT##w(j) \
-                    int result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                    int result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                    INIT_QUANT##type(j, w*w) \
+                    int result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
+                    int result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
                     if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \
                     { \
                         oks[0] = 0; \
                         fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                         break; \
                     } \
-                    call_c2( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                    call_a2( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                    call_c2( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
+                    call_a2( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
                 } \
             } \
         }
 
-        TEST_QUANT( quant_8x8, CQM_8IY, 8 );
-        TEST_QUANT( quant_8x8, CQM_8PY, 8 );
-        TEST_QUANT( quant_4x4, CQM_4IY, 4 );
-        TEST_QUANT( quant_4x4, CQM_4PY, 4 );
+        TEST_QUANT( quant_8x8, CQM_8IY, 8, 8, 2 );
+        TEST_QUANT( quant_8x8, CQM_8PY, 8, 8, 2 );
+        TEST_QUANT( quant_4x4, CQM_4IY, 4, 4, 2 );
+        TEST_QUANT( quant_4x4, CQM_4PY, 4, 4, 2 );
+        TEST_QUANT( quant_4x4x4, CQM_4IY, 4, 8, 16 );
+        TEST_QUANT( quant_4x4x4, CQM_4PY, 4, 8, 16 );
         TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
         TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );
 
@@ -1862,7 +1864,7 @@ static int check_quant( int cpu_ref, int cpu_new )
             used_asms[1] = 1; \
             for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
             { \
-                INIT_QUANT##w(1) \
+                INIT_QUANT##w(1, w*w) \
                 qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                 memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                 call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \