git.sesse.net Git - x264/commitdiff
Significantly faster CABAC and CAVLC residual coding and bit cost calculation
author Fiona Glaser <fiona@x264.com>
Fri, 28 Nov 2008 03:37:56 +0000 (19:37 -0800)
committer Fiona Glaser <fiona@x264.com>
Fri, 28 Nov 2008 06:04:35 +0000 (22:04 -0800)
Early-terminate in residual writing using stored nnz counts
To allow the above, store nnz counts for luma and chroma DC
Add assembly functions to find the last nonzero coefficient in a block
Overall ~1.9% faster at subme9+8x8dct+qp25 with CAVLC, ~0.7% faster with CABAC
Note this changes output slightly with CABAC RDO because it requires always storing correct nnz values during RDO, which wasn't done before in cases it wasn't useful.
CAVLC output should be equivalent.

common/common.h
common/quant.c
common/quant.h
common/x86/quant-a.asm
common/x86/quant.h
encoder/cabac.c
encoder/cavlc.c
encoder/macroblock.c
tools/checkasm.c

index 1a260f9d8cea1ac13d37ae7f2df52d94e1c345cf..c8405fc3242a5db29be5a32b7eb37d7dbeda2134 100644 (file)
@@ -213,7 +213,7 @@ typedef struct
 #define X264_SCAN8_SIZE (6*8)
 #define X264_SCAN8_0 (4+1*8)
 
-static const int x264_scan8[16+2*4] =
+static const int x264_scan8[16+2*4+3] =
 {
     /* Luma */
     4+1*8, 5+1*8, 4+2*8, 5+2*8,
@@ -228,6 +228,12 @@ static const int x264_scan8[16+2*4] =
     /* Cr */
     1+4*8, 2+4*8,
     1+5*8, 2+5*8,
+
+    /* Luma DC */
+    4+5*8,
+
+    /* Chroma DC */
+    5+5*8, 6+5*8
 };
 /*
    0 1 2 3 4 5 6 7
@@ -236,7 +242,7 @@ static const int x264_scan8[16+2*4] =
  2   B B   L L L L
  3         L L L L
  4   R R   L L L L
- 5   R R
+ 5   R R   DyDuDv
 */
 
 typedef struct x264_ratecontrol_t   x264_ratecontrol_t;
index 42244e36e9b7af429eea3aca8c0f171d713322a4..ee7b9485c5750773598b472bc04af4cdd65b82a8 100644 (file)
@@ -245,6 +245,34 @@ static int x264_decimate_score64( int16_t *dct )
     return x264_decimate_score_internal( dct, 64 );
 }
 
+static int ALWAYS_INLINE x264_coeff_last_internal( int16_t *l, int i_count )
+{
+    int i_last;
+    for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
+        if( *(uint64_t*)(l+i_last-3) )
+            break;
+    while( i_last >= 0 && l[i_last] == 0 )
+        i_last--;
+    return i_last;
+}
+
+static int x264_coeff_last4( int16_t *l )
+{
+    return x264_coeff_last_internal( l, 4 );
+}
+static int x264_coeff_last15( int16_t *l )
+{
+    return x264_coeff_last_internal( l, 15 );
+}
+static int x264_coeff_last16( int16_t *l )
+{
+    return x264_coeff_last_internal( l, 16 );
+}
+static int x264_coeff_last64( int16_t *l )
+{
+    return x264_coeff_last_internal( l, 64 );
+}
+
 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 {
     pf->quant_8x8 = quant_8x8;
@@ -261,6 +289,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     pf->decimate_score16 = x264_decimate_score16;
     pf->decimate_score64 = x264_decimate_score64;
 
+    pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4;
+    pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15;
+    pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
+    pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
+
 #ifdef HAVE_MMX
     if( cpu&X264_CPU_MMX )
     {
@@ -287,7 +320,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_mmxext;
         pf->decimate_score16 = x264_decimate_score16_mmxext;
         pf->decimate_score64 = x264_decimate_score64_mmxext;
+        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmxext;
+        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
+        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
 #endif
+        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
     }
 
     if( cpu&X264_CPU_SSE2 )
@@ -307,6 +344,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_sse2;
         pf->decimate_score16 = x264_decimate_score16_sse2;
         pf->decimate_score64 = x264_decimate_score64_sse2;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
     }
 
     if( cpu&X264_CPU_SSSE3 )
@@ -333,4 +373,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->dequant_8x8 = x264_dequant_8x8_altivec;
     }
 #endif
+    pf->coeff_last[  DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
+    pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
 }
index 3b128e6d3b27ad62e945b8e9a0c1c3db4c1cc659..dabd60cef4a047ce8ae8bfc6528e4bb2c357b49c 100644 (file)
@@ -39,6 +39,7 @@ typedef struct
     int (*decimate_score15)( int16_t *dct );
     int (*decimate_score16)( int16_t *dct );
     int (*decimate_score64)( int16_t *dct );
+    int (*coeff_last[6])( int16_t *dct );
 } x264_quant_function_t;
 
 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
index 0abb906cb8c81b754303728b92a148ddb104dbad..6bedf549d1b2ebd5eddf5c40a82d58acc0d28b12 100644 (file)
@@ -671,3 +671,107 @@ INIT_XMM
 DECIMATE8x8 sse2
 DECIMATE8x8 ssse3
 
+%macro LAST_MASK_SSE2 2-3
+    movdqa   xmm0, [%2+ 0]
+    pxor     xmm2, xmm2
+    packsswb xmm0, [%2+16]
+    pcmpeqb  xmm0, xmm2
+    pmovmskb   %1, xmm0
+%endmacro
+
+%macro LAST_MASK_MMX 3
+    movq     mm0, [%2+ 0]
+    movq     mm1, [%2+16]
+    pxor     mm2, mm2
+    packsswb mm0, [%2+ 8]
+    packsswb mm1, [%2+24]
+    pcmpeqb  mm0, mm2
+    pcmpeqb  mm1, mm2
+    pmovmskb  %1, mm0
+    pmovmskb  %3, mm1
+    shl       %3, 8
+    or        %1, %3
+%endmacro
+
+%ifdef ARCH_X86_64
+cglobal x264_coeff_last4_mmxext, 1,1
+    bsr rax, [r0]
+    shr eax, 4
+    RET
+%else
+cglobal x264_coeff_last4_mmxext, 0,3
+    mov   edx, r0m
+    mov   eax, [edx+4]
+    xor   ecx, ecx
+    test  eax, eax
+    cmovz eax, [edx]
+    setnz cl
+    bsr   eax, eax
+    shr   eax, 4
+    lea   eax, [eax+ecx*2]
+    RET
+%endif
+
+%macro COEFF_LAST 1
+cglobal x264_coeff_last15_%1, 1,3
+    LAST_MASK r1d, r0-2, r2d
+    xor r1d, 0xffff
+    bsr eax, r1d
+    dec eax
+    RET
+
+cglobal x264_coeff_last16_%1, 1,3
+    LAST_MASK r1d, r0, r2d
+    xor r1d, 0xffff
+    bsr eax, r1d
+    RET
+
+%ifndef ARCH_X86_64
+%ifidn %1, mmxext
+    cglobal x264_coeff_last64_%1, 1,5
+%else
+    cglobal x264_coeff_last64_%1, 1,4
+%endif
+    LAST_MASK r1d, r0, r4d
+    LAST_MASK r2d, r0+32, r4d
+    shl r2d, 16
+    or  r1d, r2d
+    LAST_MASK r2d, r0+64, r4d
+    LAST_MASK r3d, r0+96, r4d
+    shl r3d, 16
+    or  r2d, r3d
+    not r1d
+    xor r2d, -1
+    jne .secondhalf
+    bsr eax, r1d
+    RET
+.secondhalf:
+    bsr eax, r2d
+    add eax, 32
+    RET
+%endif
+%endmacro
+
+%ifdef ARCH_X86_64
+    cglobal x264_coeff_last64_sse2, 1,4
+    LAST_MASK_SSE2 r1d, r0
+    LAST_MASK_SSE2 r2d, r0+32
+    LAST_MASK_SSE2 r3d, r0+64
+    LAST_MASK_SSE2 r0d, r0+96
+    shl r2d, 16
+    shl r0d, 16
+    or  r1d, r2d
+    or  r3d, r0d
+    shl r3,  32
+    or  r1,  r3
+    not r1
+    bsr rax, r1
+    RET
+%endif
+
+%ifndef ARCH_X86_64
+%define LAST_MASK LAST_MASK_MMX
+COEFF_LAST mmxext
+%endif
+%define LAST_MASK LAST_MASK_SSE2
+COEFF_LAST sse2
index 29cb76decd8e5078d25182e3e1f631ee9e80f3c5..8e9dbb65301b38674896699923b8ee9f238aec2d 100644 (file)
@@ -57,5 +57,12 @@ int x264_decimate_score16_ssse3 ( int16_t *dct );
 int x264_decimate_score64_mmxext( int16_t *dct );
 int x264_decimate_score64_sse2  ( int16_t *dct );
 int x264_decimate_score64_ssse3 ( int16_t *dct );
+int x264_coeff_last4_mmxext( int16_t *dct );
+int x264_coeff_last15_mmxext( int16_t *dct );
+int x264_coeff_last16_mmxext( int16_t *dct );
+int x264_coeff_last64_mmxext( int16_t *dct );
+int x264_coeff_last15_sse2( int16_t *dct );
+int x264_coeff_last16_sse2( int16_t *dct );
+int x264_coeff_last64_sse2( int16_t *dct );
 
 #endif
index 3dbc8fc5ddb1cba8c2d863a7714e2384990dc591..0768de718d552b91a688739154517e50eeb636bf 100644 (file)
@@ -595,6 +595,7 @@ static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
             break;
         case DCT_CHROMA_DC:
             /* no need to test skip/pcm */
+            i_idx -= 25;
             if( h->mb.i_neighbour & MB_LEFT )
             {
                 i_mba_xy = h->mb.i_mb_xy - 1;
@@ -684,23 +685,18 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
     const uint8_t *significant_coeff_flag_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced];
 
     int i_coeff_abs_m1[64];
-    int UNUSED i_coeff_sign[64];
+    int i_coeff_sign[64];
     int i_coeff = 0;
-    int i_last  = 0;
+    int i_last;
     int i_sigmap_size;
     int node_ctx = 0;
-    int i, j;
-
-    /* yes this is always aligned, and l[-1] exists in the cases where it's used (ac) */
-    for( j = i_count - 4; j >= -1; j -= 4 )
-        if( *(uint64_t*)(l+j) )
-            break;
+    int i;
 
     if( i_count != 64 )
     {
         /* coded block flag */
         int ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
-        if( j >= -1 )
+        if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
             x264_cabac_encode_decision( cb, ctx, 1 );
         else
         {
@@ -709,9 +705,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
         }
     }
 
-    for( i = j; i < j+4; i++)
-        if( l[i] )
-            i_last = i;
+    i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
 
     i_sigmap_size = X264_MIN( i_last+1, i_count-1 );
 
@@ -722,7 +716,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
         {\
             i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;\
             if( !RDO_SKIP_BS )\
-                i_coeff_sign[i_coeff]   = l[i] < 0;\
+                i_coeff_sign[i_coeff] = l[i] < 0;\
             i_coeff++;\
             x264_cabac_encode_decision( cb, i_ctx_sig + (l8x8 ? significant_coeff_flag_offset[i] : i), 1 );\
             x264_cabac_encode_decision( cb, i_ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), i == i_last );\
@@ -762,7 +756,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
             cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
             cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
 #else
-            for( j = 0; j < i_prefix - 1; j++ )
+            for( i = 0; i < i_prefix - 1; i++ )
                 x264_cabac_encode_decision( cb, ctx, 1 );
             if( i_prefix < 14 )
                 x264_cabac_encode_decision( cb, ctx, 0 );
@@ -1002,7 +996,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
         if( i_mb_type == I_16x16 )
         {
             /* DC Luma */
-            block_residual_write_cabac( h, cb, DCT_LUMA_DC, 0, h->dct.luma16x16_dc, 16 );
+            block_residual_write_cabac( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 16 );
 
             /* AC Luma */
             if( h->mb.i_cbp_luma != 0 )
@@ -1024,8 +1018,8 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
 
         if( h->mb.i_cbp_chroma &0x03 )    /* Chroma DC residual present */
         {
-            block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 0, h->dct.chroma_dc[0], 4 );
-            block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 1, h->dct.chroma_dc[1], 4 );
+            block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
+            block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
         }
         if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
         {
@@ -1078,15 +1072,29 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
         if( h->mb.i_cbp_luma & (1 << i8) )
         {
             if( h->mb.b_transform_8x8 )
+            {
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
                 block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
+            }
             else
             {
                 int i4;
                 for( i4 = 0; i4 < 4; i4++ )
+                {
+                    h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
                     block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
+                }
             }
         }
+        else
+        {
+            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
+            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
+        }
 
+        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero( h->dct.luma4x4[16+i8] );
+        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero( h->dct.luma4x4[20+i8] );
         block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
         block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
 
@@ -1097,12 +1105,14 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
 static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
 {
     int b_8x4 = i_pixel == PIXEL_8x4;
+    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4+2-b_8x4] );
     block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     if( i_pixel == PIXEL_4x4 )
         x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
     else
     {
         x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
+        h->mb.cache.non_zero_count[x264_scan8[i4+2-b_8x4]] = array_non_zero( h->dct.luma4x4[i4+2-b_8x4] );
         block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16 );
     }
 }
@@ -1115,9 +1125,9 @@ static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8,
     x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
     if( nnz )
     {
-        block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
         *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101;
         *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
+        block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
     }
     else
     {
@@ -1131,8 +1141,8 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4,
     const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 );
     i_mode = x264_mb_pred_mode4x4_fix( i_mode );
     x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
-    block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
+    block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
 }
 
 static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
@@ -1141,8 +1151,8 @@ static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
     x264_cabac_mb_cbp_chroma( h, cb );
     if( h->mb.i_cbp_chroma > 0 )
     {
-        block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 0, h->dct.chroma_dc[0], 4 );
-        block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 1, h->dct.chroma_dc[1], 4 );
+        block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
+        block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
 
         if( h->mb.i_cbp_chroma == 2 )
         {
index 483df9b181b5a81a259e5b4ebda8506a1c50441c..49c9c7ff167f068584f652e1d1027ba4fca35344 100644 (file)
@@ -55,9 +55,6 @@ static const uint8_t sub_mb_type_b_to_golomb[13]=
     10,  4,  5,  1, 11,  6,  7,  2, 12,  8,  9,  3,  0
 };
 
-#define BLOCK_INDEX_CHROMA_DC   (-1)
-#define BLOCK_INDEX_LUMA_DC     (-2)
-
 static inline void bs_write_vlc( bs_t *s, vlc_t v )
 {
     bs_write( s, v.i_size, v.i_bits );
@@ -66,71 +63,59 @@ static inline void bs_write_vlc( bs_t *s, vlc_t v )
 /****************************************************************************
  * block_residual_write_cavlc:
  ****************************************************************************/
-static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t *l, int i_count )
+static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
 {
+    static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
     int level[16], run[16];
     int i_total, i_trailing;
     int i_total_zero;
     int i_last;
     unsigned int i_sign;
     int i;
+    int idx = 0;
     int i_suffix_length;
+    /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
+    int nC = i_idx >= 25 ? 4 : ct_index[x264_mb_predict_non_zero_code( h, i_idx == 24 ? 0 : i_idx )];
 
-    /* first find i_last */
-    for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
-        if( *(uint64_t*)(l+i_last-3) )
-            break;
-    while( i_last >= 0 && l[i_last] == 0 )
-        i_last--;
+    if( !h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
+    {
+        bs_write_vlc( s, x264_coeff_token[nC][0] );
+        return;
+    }
 
+    i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
     i_sign = 0;
     i_total = 0;
     i_trailing = 0;
     i_total_zero = i_last + 1;
 
-    if( i_last >= 0 )
+    /* level and run and total */
+    while( i_last >= 0 )
     {
-        int idx = 0;
-
-        /* level and run and total */
-        while( i_last >= 0 )
-        {
-            int r = 0;
-            level[idx] = l[i_last];
-            while( --i_last >= 0 && l[i_last] == 0 )
-                r++;
-            run[idx++] = r;
-        }
+        int r = 0;
+        level[idx] = l[i_last];
+        while( --i_last >= 0 && l[i_last] == 0 )
+            r++;
+        run[idx++] = r;
+    }
 
-        i_total = idx;
-        i_total_zero -= idx;
+    i_total = idx;
+    i_total_zero -= idx;
 
-        i_trailing = X264_MIN(3, idx);
-        for( idx = 0; idx < i_trailing; idx++ )
+    i_trailing = X264_MIN(3, idx);
+    for( idx = 0; idx < i_trailing; idx++ )
+    {
+        if( (unsigned)(level[idx]+1) > 2 )
         {
-            if( (unsigned)(level[idx]+1) > 2 )
-            {
-                i_trailing = idx;
-                break;
-            }
-            i_sign <<= 1;
-            i_sign |= level[idx] < 0;
+            i_trailing = idx;
+            break;
         }
+        i_sign <<= 1;
+        i_sign |= level[idx] < 0;
     }
 
     /* total/trailing */
-    if( i_idx == BLOCK_INDEX_CHROMA_DC )
-        bs_write_vlc( s, x264_coeff_token[4][i_total*4+i_trailing] );
-    else
-    {
-        /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
-        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
-        int nC = x264_mb_predict_non_zero_code( h, i_idx == BLOCK_INDEX_LUMA_DC ? 0 : i_idx );
-        bs_write_vlc( s, x264_coeff_token[ct_index[nC]][i_total*4+i_trailing] );
-    }
-
-    if( i_total <= 0 )
-        return;
+    bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing] );
 
     i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
     if( i_trailing > 0 )
@@ -194,7 +179,7 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t *
 
     if( i_total < i_count )
     {
-        if( i_idx == BLOCK_INDEX_CHROMA_DC )
+        if( i_idx >= 25 )
             bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
         else
             bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
@@ -214,7 +199,7 @@ static void cavlc_qp_delta( x264_t *h, bs_t *s )
 
     /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
     if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
-        && !array_non_zero(h->dct.luma16x16_dc) )
+        && !h->mb.cache.non_zero_count[x264_scan8[24]] )
     {
 #if !RDO_SKIP_BS
         h->mb.i_qp = h->mb.i_last_qp;
@@ -291,7 +276,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
             for( i4 = 0; i4 < 4; i4++ )
             {
                 h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
-                block_residual_write_cavlc( h, s, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
+                block_residual_write_cavlc( h, s, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
             }
 }
 
@@ -604,14 +589,14 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         cavlc_qp_delta( h, s );
 
         /* DC Luma */
-        block_residual_write_cavlc( h, s, BLOCK_INDEX_LUMA_DC , h->dct.luma16x16_dc, 16 );
+        block_residual_write_cavlc( h, s, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc, 16 );
 
         /* AC Luma */
         if( h->mb.i_cbp_luma )
             for( i = 0; i < 16; i++ )
             {
                 h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
-                block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
+                block_residual_write_cavlc( h, s, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15 );
             }
     }
     else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
@@ -622,13 +607,13 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
     if( h->mb.i_cbp_chroma )
     {
         /* Chroma DC residual present */
-        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
-        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
+        block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
+        block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
         if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
             for( i = 16; i < 24; i++ )
             {
                 h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
-                block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
+                block_residual_write_cavlc( h, s, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
             }
     }
 
@@ -678,9 +663,9 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
     {
         x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
         h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
-        block_residual_write_cavlc( h, &s, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
+        block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
         h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
-        block_residual_write_cavlc( h, &s, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
+        block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
         i8 += x264_pixel_size[i_pixel].h >> 3;
     }
 
@@ -694,12 +679,12 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
     s.i_bits_encoded = 0;
     cavlc_mb_mvd( h, &s, 0, i4, 1+b_8x4 );
     h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
-    block_residual_write_cavlc( h, &s, i4, h->dct.luma4x4[i4], 16 );
+    block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     if( i_pixel != PIXEL_4x4 )
     {
         i4 += 2-b_8x4;
         h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
-        block_residual_write_cavlc( h, &s, i4, h->dct.luma4x4[i4], 16 );
+        block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     }
 
     return s.i_bits_encoded;
@@ -715,14 +700,13 @@ static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
 
 static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
 {
-    int i4, i;
+    int i4;
     h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
+    h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
     for( i4 = 0; i4 < 4; i4++ )
     {
-        for( i = 0; i < 16; i++ )
-            h->dct.luma4x4[i4+i8*4][i] = h->dct.luma8x8[i8][i4+i*4];
         h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
-        block_residual_write_cavlc( h, &h->out.bs, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
+        block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
     }
     return h->out.bs.i_bits_encoded;
 }
@@ -731,7 +715,7 @@ static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
 {
     h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
     h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
-    block_residual_write_cavlc( h, &h->out.bs, i4, h->dct.luma4x4[i4], 16 );
+    block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     return h->out.bs.i_bits_encoded;
 }
 
@@ -740,8 +724,8 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
     h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
     if( h->mb.i_cbp_chroma )
     {
-        block_residual_write_cavlc( h, &h->out.bs, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
-        block_residual_write_cavlc( h, &h->out.bs, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
+        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
+        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
 
         if( h->mb.i_cbp_chroma == 2 )
         {
@@ -749,7 +733,7 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
             for( i = 16; i < 24; i++ )
             {
                 h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
-                block_residual_write_cavlc( h, &h->out.bs, i, h->dct.luma4x4[i]+1, 15 );
+                block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
             }
         }
     }
index da490b0fe403ea30b961137289971b71bbb848a1..42c931af4dce02a3338b057395ac2798bfe9bc0d 100644 (file)
@@ -277,9 +277,12 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
         h->mb.i_cbp_chroma |= nz;
     }
+    h->mb.cache.non_zero_count[x264_scan8[25]] = array_non_zero( h->dct.chroma_dc[0] );
+    h->mb.cache.non_zero_count[x264_scan8[26]] = array_non_zero( h->dct.chroma_dc[1] );
     if( h->mb.i_cbp_chroma )
         h->mb.i_cbp_chroma = 2;    /* dc+ac (we can't do only ac) */
-    else if( array_non_zero( h->dct.chroma_dc ) )
+    else if( h->mb.cache.non_zero_count[x264_scan8[25]] |
+             h->mb.cache.non_zero_count[x264_scan8[26]] )
         h->mb.i_cbp_chroma = 1;    /* dc only */
 }
 
@@ -643,6 +646,7 @@ void x264_macroblock_encode( x264_t *h )
             h->mb.i_cbp_luma |= nz;
         }
         h->mb.i_cbp_luma *= 0xf;
+        h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( h->dct.luma16x16_dc );
     }
     else
     {
@@ -671,13 +675,14 @@ void x264_macroblock_encode( x264_t *h )
                 h->mb.i_cbp_luma |= cbp << i;
             }
         }
+        h->mb.cache.non_zero_count[x264_scan8[24]] = 0;
     }
 
     if( h->param.b_cabac )
     {
-        i_cbp_dc = ( h->mb.i_type == I_16x16 && array_non_zero( h->dct.luma16x16_dc ) )
-                 | array_non_zero( h->dct.chroma_dc[0] ) << 1
-                 | array_non_zero( h->dct.chroma_dc[1] ) << 2;
+        i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
+                 | h->mb.cache.non_zero_count[x264_scan8[25]] << 1
+                 | h->mb.cache.non_zero_count[x264_scan8[26]] << 2;
     }
 
     /* store cbp */
index 16e75e66e5f08f03569161914f3d419992428194..e810cdcca4ffb2fbc59e7671290a9e7a28cb88e0 100644 (file)
@@ -1125,7 +1125,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     }
     report( "denoise dct :" );
 
-#define TEST_DECIMATE( qname, decname, block, w, ac, thresh ) \
+#define TEST_DECIMATE( decname, w, ac, thresh ) \
     if( qf_a.decname != qf_ref.decname ) \
     { \
         set_func_name( #decname ); \
@@ -1152,11 +1152,46 @@ static int check_quant( int cpu_ref, int cpu_new )
     }
 
     ok = 1;
-    TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0, 6 );
-    TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0, 6 );
-    TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1, 7 );
+    TEST_DECIMATE( decimate_score64, 8, 0, 6 );
+    TEST_DECIMATE( decimate_score16, 4, 0, 6 );
+    TEST_DECIMATE( decimate_score15, 4, 1, 7 );
     report( "decimate_score :" );
 
+#define TEST_LAST( last, lastname, w, ac ) \
+    if( qf_a.last != qf_ref.last ) \
+    { \
+        set_func_name( #lastname ); \
+        used_asm = 1; \
+        for( i = 0; i < 100; i++ ) \
+        { \
+            int result_c, result_a, idx, nnz=0; \
+            int max = rand() & (w*w-1); \
+            memset( dct1, 0, w*w*2 ); \
+            for( idx = ac; idx < max; idx++ ) \
+                nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
+            if( !nnz ) \
+                dct1[ac] = 1; \
+            memcpy( dct2, dct1, w*w*2 ); \
+            result_c = call_c1( qf_c.last, (void*)(dct2+ac) ); \
+            result_a = call_a1( qf_a.last, (void*)(dct2+ac) ); \
+            if( result_c != result_a ) \
+            { \
+                ok = 0; \
+                fprintf( stderr, #lastname ": [FAILED]\n" ); \
+                break; \
+            } \
+            call_c2( qf_c.last, (void*)(dct2+ac) ); \
+            call_a2( qf_a.last, (void*)(dct2+ac) ); \
+        } \
+    }
+
+    ok = 1;
+    TEST_LAST( coeff_last[DCT_CHROMA_DC],  coeff_last4, 2, 0 );
+    TEST_LAST( coeff_last[  DCT_LUMA_AC], coeff_last15, 4, 1 );
+    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 );
+    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 );
+    report( "coeff_last :" );
+
     return ret;
 }