From 1f0e78d8ea5b0d260f8497d4817b1962f6b0894d Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Mon, 19 Jan 2009 15:17:53 -0800
Subject: [PATCH] Eliminate support for direct_8x8_inference=0. The benefit in
 the most extreme contrived situation was at most 0.001 dB PSNR, at the cost of
 slower decoding. As this option was basically useless, it was a waste of code
 and prevented some other useful optimizations. Remove some unused mc code
 related to sub-8x8 partitions. Small deblocking speedup when p4x4 is used.
 Also remove unused x264_nal_decode prototype from x264.h.

---
 common/common.c     |   3 -
 common/common.h     |   2 +-
 common/frame.c      |   2 +-
 common/macroblock.c | 140 ++++++--------------------------------------
 common/macroblock.h |  30 ++--------
 encoder/encoder.c   |   3 -
 encoder/set.c       |   6 +-
 x264.c              |   6 --
 x264.h              |   7 +--
 9 files changed, 26 insertions(+), 173 deletions(-)

diff --git a/common/common.c b/common/common.c
index 6669cae8..c163e092 100644
--- a/common/common.c
+++ b/common/common.c
@@ -123,7 +123,6 @@ void    x264_param_default( x264_param_t *param )
     param->analyse.b_chroma_me = 1;
     param->analyse.i_mv_range_thread = -1;
     param->analyse.i_mv_range = -1; // set from level_idc
-    param->analyse.i_direct_8x8_inference = 1;
     param->analyse.i_chroma_qp_offset = 0;
     param->analyse.b_fast_pskip = 1;
     param->analyse.b_dct_decimate = 1;
@@ -458,8 +457,6 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         p->analyse.b_weighted_bipred = atobool(value);
     OPT2("direct", "direct-pred")
         b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred );
-    OPT("direct-8x8")
-        p->analyse.i_direct_8x8_inference = atoi(value);
     OPT("chroma-qp-offset")
         p->analyse.i_chroma_qp_offset = atoi(value);
     OPT("me")
diff --git a/common/common.h b/common/common.h
index 4e1782a2..78b1efb6 100644
--- a/common/common.h
+++ b/common/common.h
@@ -440,7 +440,7 @@ struct x264_t
         /* current value */
         int     i_type;
         int     i_partition;
-        int     i_sub_partition[4];
+        DECLARE_ALIGNED_4( uint8_t i_sub_partition[4] );
         int     b_transform_8x8;
 
         int     i_cbp_luma;
diff --git a/common/frame.c b/common/frame.c
index 021242f1..5b04d682 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -623,7 +623,6 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
     const int b_interlaced = h->sh.b_mbaff;
     const int mvy_limit = 4 >> b_interlaced;
     const int qp_thresh = 15 - X264_MIN(h->sh.i_alpha_c0_offset, h->sh.i_beta_offset) - X264_MAX(0, h->param.analyse.i_chroma_qp_offset);
-    const int no_sub8x8 = !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
     int mb_x;
     int stridey   = h->fdec->i_stride[0];
     int stride2y  = stridey << b_interlaced;
@@ -641,6 +640,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         const int b_8x8_transform = h->mb.mb_transform_size[mb_xy];
         const int i_qp = h->mb.qp[mb_xy];
         int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4;
+        int no_sub8x8 = h->mb.type[mb_xy] != P_8x8 || !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
         uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
         uint8_t *pixu = h->fdec->plane[1] +  8*mb_y*strideuv +  8*mb_x;
         uint8_t *pixv = h->fdec->plane[2] +  8*mb_y*strideuv +  8*mb_x;
diff --git a/common/macroblock.c b/common/macroblock.c
index fed6e2b4..5f5823a7 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -157,7 +157,6 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
     int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;
     int i_mb_8x8 =  4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
     int i8, i4;
-    int b8x8;
     const int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
@@ -169,8 +168,6 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
         x264_macroblock_cache_mv(  h, 0, 0, 4, 4, 1, 0 );
         return 1;
     }
-    b8x8 = h->sps->b_direct8x8_inference ||
-           (type_col != P_8x8 && type_col != B_SKIP && type_col != B_DIRECT && type_col != B_8x8);
 
     for( i8 = 0; i8 < 4; i8++ )
     {
@@ -182,30 +179,12 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
         if( i_ref >= 0 )
         {
             const int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0];
-
+            const int16_t *mv_col = h->fref1[0]->mv[0][ i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
+            const int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
+            const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
-
-            if( b8x8 )
-            {
-                const int16_t *mv_col = h->fref1[0]->mv[0][ i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
-                const int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
-                const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
-                x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, pack16to32_mask(l0x, l0y) );
-                x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
-            }
-            else
-            {
-                for( i4 = 0; i4 < 4; i4++ )
-                {
-                    const int x4 = i4%2 + 2*x8;
-                    const int y4 = i4/2 + 2*y8;
-                    const int16_t *mv_col = h->fref1[0]->mv[0][ i_mb_4x4 + x4 + y4 * h->mb.i_b4_stride ];
-                    const int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
-                    const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
-                    x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, pack16to32_mask(l0x, l0y) );
-                    x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
-                }
-            }
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, pack16to32_mask(l0x, l0y) );
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
         }
         else
         {
@@ -220,8 +199,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
 
     if( h->param.i_threads > 1 )
     {
-        int di = b8x8 ? 4 : 1;
-        for( i4=0; i4<16; i4+=di )
+        for( i4=0; i4<16; i4+=4 )
         {
             if( h->mb.cache.mv[0][x264_scan8[i4]][1] > h->mb.mv_max_spel[1]
              || h->mb.cache.mv[1][x264_scan8[i4]][1] > h->mb.mv_max_spel[1] )
@@ -247,8 +225,7 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
     int ref[2];
     DECLARE_ALIGNED_8( int16_t mv[2][2] );
     int i_list;
-    int i8, i4;
-    int b8x8;
+    int i8;
     const int8_t *l1ref0 = &h->fref1[0]->ref[0][ h->mb.i_b8_xy ];
     const int8_t *l1ref1 = &h->fref1[0]->ref[1][ h->mb.i_b8_xy ];
     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->fref1[0]->mv[0][ h->mb.i_b4_xy ];
@@ -310,9 +287,6 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
     if( IS_INTRA( type_col ) || (ref[0]&&ref[1]) )
         return 1;
 
-    b8x8 = h->sps->b_direct8x8_inference ||
-           (type_col != P_8x8 && type_col != B_SKIP && type_col != B_DIRECT && type_col != B_8x8);
-
     /* col_zero_flag */
     for( i8=0; i8<4; i8++ )
     {
@@ -322,32 +296,13 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
         if( l1ref0[o8] == 0 || ( l1ref0[o8] < 0 && l1ref1[o8] == 0 ) )
         {
             const int16_t (*l1mv)[2] = (l1ref0[o8] == 0) ? l1mv0 : l1mv1;
-            if( b8x8 )
+            const int16_t *mvcol = l1mv[3*x8 + 3*y8 * h->mb.i_b4_stride];
+            if( abs( mvcol[0] ) <= 1 && abs( mvcol[1] ) <= 1 )
             {
-                const int16_t *mvcol = l1mv[3*x8 + 3*y8 * h->mb.i_b4_stride];
-                if( abs( mvcol[0] ) <= 1 && abs( mvcol[1] ) <= 1 )
-                {
-                    if( ref[0] == 0 )
-                        x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
-                    if( ref[1] == 0 )
-                        x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
-                }
-            }
-            else
-            {
-                for( i4=0; i4<4; i4++ )
-                {
-                    const int x4 = i4%2 + 2*x8;
-                    const int y4 = i4/2 + 2*y8;
-                    const int16_t *mvcol = l1mv[x4 + y4 * h->mb.i_b4_stride];
-                    if( abs( mvcol[0] ) <= 1 && abs( mvcol[1] ) <= 1 )
-                    {
-                        if( ref[0] == 0 )
-                            x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, 0 );
-                        if( ref[1] == 0 )
-                            x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, 0 );
-                    }
-                }
+                if( ref[0] == 0 )
+                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
+                if( ref[1] == 0 )
+                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
             }
         }
     }
@@ -594,44 +549,13 @@ static void x264_mb_mc_direct8x8( x264_t *h, int x, int y )
 {
     const int i8 = x264_scan8[0] + x + 8*y;
 
-    /* FIXME: optimize based on current block size, not global settings? */
-    if( h->sps->b_direct8x8_inference )
-    {
-        if( h->mb.cache.ref[0][i8] >= 0 )
-            if( h->mb.cache.ref[1][i8] >= 0 )
-                x264_mb_mc_01xywh( h, x, y, 2, 2 );
-            else
-                x264_mb_mc_0xywh( h, x, y, 2, 2 );
+    if( h->mb.cache.ref[0][i8] >= 0 )
+        if( h->mb.cache.ref[1][i8] >= 0 )
+            x264_mb_mc_01xywh( h, x, y, 2, 2 );
         else
-            x264_mb_mc_1xywh( h, x, y, 2, 2 );
-    }
+            x264_mb_mc_0xywh( h, x, y, 2, 2 );
     else
-    {
-        if( h->mb.cache.ref[0][i8] >= 0 )
-        {
-            if( h->mb.cache.ref[1][i8] >= 0 )
-            {
-                x264_mb_mc_01xywh( h, x+0, y+0, 1, 1 );
-                x264_mb_mc_01xywh( h, x+1, y+0, 1, 1 );
-                x264_mb_mc_01xywh( h, x+0, y+1, 1, 1 );
-                x264_mb_mc_01xywh( h, x+1, y+1, 1, 1 );
-            }
-            else
-            {
-                x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
-                x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
-                x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
-                x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
-            }
-        }
-        else
-        {
-            x264_mb_mc_1xywh( h, x+0, y+0, 1, 1 );
-            x264_mb_mc_1xywh( h, x+1, y+0, 1, 1 );
-            x264_mb_mc_1xywh( h, x+0, y+1, 1, 1 );
-            x264_mb_mc_1xywh( h, x+1, y+1, 1, 1 );
-        }
-    }
+        x264_mb_mc_1xywh( h, x, y, 2, 2 );
 }
 
 void x264_mb_mc_8x8( x264_t *h, int i8 )
@@ -660,37 +584,9 @@ void x264_mb_mc_8x8( x264_t *h, int i8 )
         case D_L1_8x8:
             x264_mb_mc_1xywh( h, x, y, 2, 2 );
             break;
-        case D_L1_8x4:
-            x264_mb_mc_1xywh( h, x, y+0, 2, 1 );
-            x264_mb_mc_1xywh( h, x, y+1, 2, 1 );
-            break;
-        case D_L1_4x8:
-            x264_mb_mc_1xywh( h, x+0, y, 1, 2 );
-            x264_mb_mc_1xywh( h, x+1, y, 1, 2 );
-            break;
-        case D_L1_4x4:
-            x264_mb_mc_1xywh( h, x+0, y+0, 1, 1 );
-            x264_mb_mc_1xywh( h, x+1, y+0, 1, 1 );
-            x264_mb_mc_1xywh( h, x+0, y+1, 1, 1 );
-            x264_mb_mc_1xywh( h, x+1, y+1, 1, 1 );
-            break;
         case D_BI_8x8:
             x264_mb_mc_01xywh( h, x, y, 2, 2 );
             break;
-        case D_BI_8x4:
-            x264_mb_mc_01xywh( h, x, y+0, 2, 1 );
-            x264_mb_mc_01xywh( h, x, y+1, 2, 1 );
-            break;
-        case D_BI_4x8:
-            x264_mb_mc_01xywh( h, x+0, y, 1, 2 );
-            x264_mb_mc_01xywh( h, x+1, y, 1, 2 );
-            break;
-        case D_BI_4x4:
-            x264_mb_mc_01xywh( h, x+0, y+0, 1, 1 );
-            x264_mb_mc_01xywh( h, x+1, y+0, 1, 1 );
-            x264_mb_mc_01xywh( h, x+0, y+1, 1, 1 );
-            x264_mb_mc_01xywh( h, x+1, y+1, 1, 1 );
-            break;
         case D_DIRECT_8x8:
             x264_mb_mc_direct8x8( h, x, y );
             break;
diff --git a/common/macroblock.h b/common/macroblock.h
index a2d7db4e..1b0f0ca2 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -455,36 +455,14 @@ static inline int x264_mb_transform_8x8_allowed( x264_t *h )
     // large partitions are allowed
     // direct and 8x8 are conditional
     static const uint8_t partition_tab[X264_MBTYPE_MAX] = {
-        0,0,0,0,1,2,0,2,1,1,1,1,1,1,1,1,1,2,0,
+        0,0,0,0,1,2,0,1,1,1,1,1,1,1,1,1,1,1,0,
     };
-    int p, i;
 
     if( !h->pps->b_transform_8x8_mode )
         return 0;
-    p = partition_tab[h->mb.i_type];
-    if( p < 2 )
-        return p;
-    else if( h->mb.i_type == B_DIRECT )
-        return h->sps->b_direct8x8_inference;
-    else if( h->mb.i_type == P_8x8 )
-    {
-        if( !(h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
-            return 1;
-        for( i=0; i<4; i++ )
-            if( h->mb.i_sub_partition[i] != D_L0_8x8 )
-                return 0;
-        return 1;
-    }
-    else // B_8x8
-    {
-        // x264 currently doesn't use sub-8x8 B partitions, so don't check for them
-        if( h->sps->b_direct8x8_inference )
-            return 1;
-        for( i=0; i<4; i++ )
-            if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
-                return 0;
-        return 1;
-    }
+    if( h->mb.i_type != P_8x8 )
+        return partition_tab[h->mb.i_type];
+    return *(uint32_t*)h->mb.i_sub_partition == D_L0_8x8*0x01010101;
 }
 
 #endif
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 132b26de..3ef62f74 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -549,8 +549,6 @@ static int x264_validate_parameters( x264_t *h )
             h->param.analyse.i_mv_range = l->mv_range >> h->param.b_interlaced;
         else
             h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> h->param.b_interlaced);
-        if( h->param.analyse.i_direct_8x8_inference < 0 )
-            h->param.analyse.i_direct_8x8_inference = l->direct8x8;
     }
 
     if( h->param.i_threads > 1 )
@@ -595,7 +593,6 @@ static int x264_validate_parameters( x264_t *h )
     BOOLIFY( b_deblocking_filter );
     BOOLIFY( b_interlaced );
     BOOLIFY( analyse.b_transform_8x8 );
-    BOOLIFY( analyse.i_direct_8x8_inference );
     BOOLIFY( analyse.b_chroma_me );
     BOOLIFY( analyse.b_fast_pskip );
     BOOLIFY( rc.b_stat_write );
diff --git a/encoder/set.c b/encoder/set.c
index e2ec1cc9..552df80b 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -133,9 +133,7 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
         sps->i_mb_height = ( sps->i_mb_height + 1 ) & ~1;
     sps->b_frame_mbs_only = ! param->b_interlaced;
     sps->b_mb_adaptive_frame_field = param->b_interlaced;
-    sps->b_direct8x8_inference = param->analyse.i_direct_8x8_inference
-                              || ! sps->b_frame_mbs_only
-                              || !(param->analyse.inter & X264_ANALYSE_PSUB8x8);
+    sps->b_direct8x8_inference = 1;
 
     sps->crop.i_left   = 0;
     sps->crop.i_top    = 0;
@@ -565,8 +563,6 @@ int x264_validate_levels( x264_t *h, int verbose )
 
     if( h->param.i_fps_den > 0 )
         CHECK( "MB rate", l->mbps, (int64_t)mbs * h->param.i_fps_num / h->param.i_fps_den );
-    if( h->sps->b_direct8x8_inference < l->direct8x8 )
-        ERROR( "direct 8x8 inference (0) < level requirement (1)\n" );
 
     /* TODO check the rest of the limits */
     return ret;
diff --git a/x264.c b/x264.c
index e1499452..04bb44ce 100644
--- a/x264.c
+++ b/x264.c
@@ -233,11 +233,6 @@ static void Help( x264_param_t *defaults, int b_longhelp )
     H0( "      --direct <string>       Direct MV prediction mode [\"%s\"]\n"
         "                                  - none, spatial, temporal, auto\n",
                                        strtable_lookup( x264_direct_pred_names, defaults->analyse.i_direct_mv_pred ) );
-    H1( "      --direct-8x8 <-1|0|1>   Direct prediction size [%d]\n"
-        "                                  -  0: 4x4\n"
-        "                                  -  1: 8x8\n"
-        "                                  - -1: smallest possible according to level\n",
-                                       defaults->analyse.i_direct_8x8_inference );
     H0( "  -w, --weightb               Weighted prediction for B-frames\n" );
     H0( "      --me <string>           Integer pixel motion estimation method [\"%s\"]\n",
                                        strtable_lookup( x264_motion_est_names, defaults->analyse.i_me_method ) );
@@ -425,7 +420,6 @@ static int  Parse( int argc, char **argv,
             { "analyse", required_argument, NULL, 0 },
             { "partitions", required_argument, NULL, 'A' },
             { "direct",  required_argument, NULL, 0 },
-            { "direct-8x8", required_argument, NULL, 0 },
             { "weightb", no_argument,       NULL, 'w' },
             { "me",      required_argument, NULL, 0 },
             { "merange", required_argument, NULL, 0 },
diff --git a/x264.h b/x264.h
index 6e123ac9..820185f9 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 65
+#define X264_BUILD 66
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -228,7 +228,6 @@ typedef struct x264_param_t
         int          b_transform_8x8;
         int          b_weighted_bipred; /* implicit weighting for B-frames */
         int          i_direct_mv_pred; /* spatial vs temporal mv prediction */
-        int          i_direct_8x8_inference; /* forbid 4x4 direct partitions. -1 = auto, based on level */
         int          i_chroma_qp_offset;
 
         int          i_me_method; /* motion estimation algorithm to use (X264_ME_*) */
@@ -409,10 +408,6 @@ typedef struct
  *      XXX: it currently doesn't check for overflow */
 int x264_nal_encode( void *, int *, int b_annexeb, x264_nal_t *nal );
 
-/* x264_nal_decode:
- *      decode a buffer nal into a x264_nal_t */
-int x264_nal_decode( x264_nal_t *nal, void *, int );
-
 /****************************************************************************
  * Encoder functions:
  ****************************************************************************/
-- 
2.39.2