git.sesse.net Git - x264/commitdiff
Optimize macroblock_deblock_strength, add more early terminations
author Fiona Glaser <fiona@x264.com>
Fri, 15 Jul 2011 20:24:38 +0000 (13:24 -0700)
committer Fiona Glaser <fiona@x264.com>
Fri, 22 Jul 2011 22:49:23 +0000 (15:49 -0700)
common/deblock.c
common/macroblock.c
common/macroblock.h

index bff37048f8b7451f0b2b807fe8b22ead507d5a4b..6b2212a56d38e31b75941f43b4cc02754a31dac0 100644 (file)
@@ -328,6 +328,44 @@ static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, int i_strid
     pf_intra( pix, i_stride, alpha, beta );
 }
 
+static ALWAYS_INLINE void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
+{ /* Recomputes in h->mb the current/top/left macroblock indices and the MB_LEFT/MB_TOP neighbour flags for the macroblock at (mb_x, mb_y). */
+    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2; /* when idc is 2, a neighbour only counts if it has the same slice_table entry (see the checks at the end) */
+
+    h->mb.i_neighbour = 0; /* flags are rebuilt from scratch at the end of this function */
+    h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
+    h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy];
+    h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED); /* one row up, or two rows up if MB_INTERLACED is 1 (assumes MB_INTERLACED is 0 or 1 -- TODO confirm) */
+    h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y; /* can be negative for the first rows; guarded by >= 0 / mb_y checks before it is used as an index below */
+    h->mb.i_mb_left_xy[1] =
+    h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1; /* default: both left entries refer to the MB immediately to the left */
+    if( SLICE_MBAFF ) /* adjust neighbours whose field[] flag does not match this MB */
+    {
+        if( mb_y&1 ) /* odd MB row */
+        {
+            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
+                h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride; /* left[0] moves up one MB row; left[1] stays on this row */
+        }
+        else /* even MB row */
+        {
+            if( h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] )
+            {
+                h->mb.i_mb_top_xy += h->mb.i_mb_stride; /* top MB has field flag 0: use the MB one row below it instead */
+                h->mb.i_mb_top_y++;
+            }
+            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
+                h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride; /* left[1] moves down one MB row; left[0] stays on this row */
+        }
+    }
+
+    if( mb_x > 0 && (deblock_on_slice_edges ||
+        h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy]) )
+        h->mb.i_neighbour |= MB_LEFT; /* a left MB exists and is either unconditionally allowed or has the same slice_table entry */
+    if( mb_y > MB_INTERLACED && (deblock_on_slice_edges
+        || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy]) )
+        h->mb.i_neighbour |= MB_TOP; /* same rule applied to the top neighbour */
+}
+
 void x264_frame_deblock_row( x264_t *h, int mb_y )
 {
     int b_interlaced = SLICE_MBAFF;
index 7ac6490866596409817414d5077dd34ee7c8a6cf..a5c1d9b785d67136b7ef525556881a94a303386f 100644 (file)
@@ -1337,44 +1337,6 @@ void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y )
     x264_macroblock_cache_load( h, mb_x, mb_y, 1 );
 }
 
-void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y )
-{
-    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
-
-    h->mb.i_neighbour = 0;
-    h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
-    h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy];
-    h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED);
-    h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y;
-    h->mb.i_mb_left_xy[1] =
-    h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
-    if( SLICE_MBAFF )
-    {
-        if( mb_y&1 )
-        {
-            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
-                h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
-        }
-        else
-        {
-            if( h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] )
-            {
-                h->mb.i_mb_top_xy += h->mb.i_mb_stride;
-                h->mb.i_mb_top_y++;
-            }
-            if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
-                h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
-        }
-    }
-
-    if( mb_x > 0 && (deblock_on_slice_edges ||
-        h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy]) )
-        h->mb.i_neighbour |= MB_LEFT;
-    if( mb_y > MB_INTERLACED && (deblock_on_slice_edges
-        || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy]) )
-        h->mb.i_neighbour |= MB_TOP;
-}
-
 static void x264_macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][4] )
 {
     if( (h->mb.i_neighbour & MB_LEFT) && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
@@ -1454,77 +1416,97 @@ static void x264_macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][
 void x264_macroblock_deblock_strength( x264_t *h )
 {
     uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
-    if( IS_INTRA( h->mb.type[h->mb.i_mb_xy] ) )
+    if( IS_INTRA( h->mb.i_type ) )
     {
         memset( bs[0][1], 3, 3*4*sizeof(uint8_t) );
         memset( bs[1][1], 3, 3*4*sizeof(uint8_t) );
         return;
     }
 
+    /* Early termination: in this case, nnz guarantees all edges use strength 2.*/
+    if( h->mb.b_transform_8x8 && (h->mb.i_cbp_luma&7) == 7 && !CHROMA444 )
+    {
+        M32( bs[0][0] ) = 0x02020202;
+        M32( bs[0][2] ) = 0x02020202;
+        M32( bs[0][4] ) = 0x02020202;
+        M32( bs[1][0] ) = 0x02020202;
+        M32( bs[1][2] ) = 0x02020202;
+        M32( bs[1][4] ) = 0x02020202;
+        return;
+    }
+
+    int neighbour_changed = 0;
+    if( h->sh.i_disable_deblocking_filter_idc != 2 )
+    {
+        neighbour_changed = h->mb.i_neighbour_frame&~h->mb.i_neighbour;
+        h->mb.i_neighbour = h->mb.i_neighbour_frame;
+    }
+
+    /* MBAFF deblock uses different left neighbors from encoding */
+    if( SLICE_MBAFF && (h->mb.i_neighbour & MB_LEFT) && (h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED) )
+    {
+        h->mb.i_mb_left_xy[1] =
+        h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
+        if( h->mb.i_mb_y&1 )
+            h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
+        else
+            h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
+    }
+
     /* If we have multiple slices and we're deblocking on slice edges, we
      * have to reload neighbour data. */
-    if( SLICE_MBAFF || (h->sh.i_first_mb && h->sh.i_disable_deblocking_filter_idc != 2) )
+    if( neighbour_changed )
     {
-        int old_neighbour = h->mb.i_neighbour;
-        int mb_x = h->mb.i_mb_x;
-        int mb_y = h->mb.i_mb_y;
-        x264_macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y );
-        int new_neighbour = h->mb.i_neighbour;
-        h->mb.i_neighbour &= ~old_neighbour;
-        if( h->mb.i_neighbour )
-        {
-            int top_y = h->mb.i_mb_top_y;
-            int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
-            int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
-            int s8x8 = h->mb.i_b8_stride;
-            int s4x4 = h->mb.i_b4_stride;
+        int top_y = h->mb.i_mb_top_y;
+        int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*h->mb.i_mb_x;
+        int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*h->mb.i_mb_x;
+        int s8x8 = h->mb.i_b8_stride;
+        int s4x4 = h->mb.i_b4_stride;
 
-            uint8_t (*nnz)[48] = h->mb.non_zero_count;
-            const x264_left_table_t *left_index_table = SLICE_MBAFF ? h->mb.left_index_table : &left_indices[3];
+        uint8_t (*nnz)[48] = h->mb.non_zero_count;
+        const x264_left_table_t *left_index_table = SLICE_MBAFF ? h->mb.left_index_table : &left_indices[3];
 
-            if( h->mb.i_neighbour & MB_TOP )
-                CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
+        if( neighbour_changed & MB_TOP )
+            CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
 
-            if( h->mb.i_neighbour & MB_LEFT )
+        if( neighbour_changed & MB_LEFT )
+        {
+            int *left = h->mb.i_mb_left_xy;
+            h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table->nnz[0]];
+            h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table->nnz[1]];
+            h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table->nnz[2]];
+            h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table->nnz[3]];
+        }
+
+        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
+        {
+            int16_t (*mv)[2] = h->mb.mv[l];
+            int8_t *ref = h->mb.ref[l];
+
+            int i8 = x264_scan8[0] - 8;
+            if( neighbour_changed & MB_TOP )
             {
-                int *left = h->mb.i_mb_left_xy;
-                h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table->nnz[0]];
-                h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table->nnz[1]];
-                h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table->nnz[2]];
-                h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table->nnz[3]];
+                h->mb.cache.ref[l][i8+0] =
+                h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
+                h->mb.cache.ref[l][i8+2] =
+                h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
+                CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
             }
 
-            for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
+            i8 = x264_scan8[0] - 1;
+            if( neighbour_changed & MB_LEFT )
             {
-                int16_t (*mv)[2] = h->mb.mv[l];
-                int8_t *ref = h->mb.ref[l];
-
-                int i8 = x264_scan8[0] - 8;
-                if( h->mb.i_neighbour & MB_TOP )
-                {
-                    h->mb.cache.ref[l][i8+0] =
-                    h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
-                    h->mb.cache.ref[l][i8+2] =
-                    h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
-                    CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
-                }
+                h->mb.cache.ref[l][i8+0*8] =
+                h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*left_index_table->ref[0]];
+                h->mb.cache.ref[l][i8+2*8] =
+                h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*left_index_table->ref[2]];
 
-                i8 = x264_scan8[0] - 1;
-                if( h->mb.i_neighbour & MB_LEFT )
-                {
-                    h->mb.cache.ref[l][i8+0*8] =
-                    h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*left_index_table->ref[0]];
-                    h->mb.cache.ref[l][i8+2*8] =
-                    h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*left_index_table->ref[2]];
-
-                    CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[0]] );
-                    CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[1]] );
-                    CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[2]] );
-                    CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[3]] );
-                }
+                CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[0]] );
+                CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[1]] );
+                CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[2]] );
+                CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[3]] );
             }
         }
-        h->mb.i_neighbour = new_neighbour;
     }
 
     if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.i_type == SLICE_TYPE_P )
@@ -1588,7 +1570,7 @@ void x264_macroblock_deblock_strength( x264_t *h )
             }
         }
 
-        if( h->mb.mb_transform_size[h->mb.i_mb_xy] )
+        if( h->mb.b_transform_8x8 )
         {
             int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
             int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
index 278031f6bf5cfbeeb93821c1970974b975f003bb..83011879cd78ac07ce4d804dba0ac8d58c156305 100644 (file)
@@ -312,7 +312,6 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y, int b_interlaced
 void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y );
 void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y );
 void x264_macroblock_deblock_strength( x264_t *h );
-void x264_macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y );
 void x264_macroblock_cache_save( x264_t *h );
 
 void x264_macroblock_bipred_init( x264_t *h );