git.sesse.net Git - x264/commitdiff
Move deblocking/hpel into sliced threads
author Fiona Glaser <fiona@x264.com>
Thu, 15 Apr 2010 23:32:31 +0000 (16:32 -0700)
committer Fiona Glaser <fiona@x264.com>
Fri, 23 Apr 2010 19:40:09 +0000 (12:40 -0700)
Instead of doing both as a separate pass, do them during the main encode.
This requires disabling deblocking between slices (disable_deblocking_filter_idc == 2).
Overall performance gain is about 11% on --preset superfast with sliced threads.
This doesn't reduce the amount of actual computation done; it only parallelizes it better.

common/common.h
common/frame.c
common/macroblock.c
common/macroblock.h
encoder/encoder.c
encoder/lookahead.c

index 2fc453d56bfce4195d2598625e4a716038eab828..c63fbd9ca1bb44a0ad57beb9a0a2404f56bd06f1 100644 (file)
@@ -566,7 +566,8 @@ struct x264_t
         int16_t (*mvr[2][32])[2];           /* 16x16 mv for each possible ref */
         int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
         int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
-        uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
+        uint16_t *slice_table;              /* sh->first_mb of the slice that the indexed mb is part of
+                                             * NOTE: this will fail on resolutions above 2^16 MBs... */
 
          /* buffer for weighted versions of the reference frames */
         uint8_t *p_weight_buf[16];
@@ -763,7 +764,9 @@ struct x264_t
     ALIGNED_16( uint16_t nr_offset[2][64] );
     uint32_t        nr_count[2];
 
+    /* Buffers that are allocated per-thread even in sliced threads. */
     void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+    uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
 
     /* CPU functions dependents */
     x264_predict_t      predict_16x16[4+3];
index fc67fbb5a64cb985a43412f23f0cd057a5d402a9..76f072d80f779d3a9dddf1de5e8b0df8fb17c584 100644 (file)
@@ -658,6 +658,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
     int stride2y  = stridey << b_interlaced;
     int strideuv  = h->fdec->i_stride[1];
     int stride2uv = strideuv << b_interlaced;
+    int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2;
     uint8_t (*nnz_backup)[16] = h->scratch_buffer;
 
     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
@@ -778,9 +779,18 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
          * i_dir == 1 -> horizontal edge */
         #define DEBLOCK_DIR(i_dir)\
         {\
-            int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
+            int i_edge = 0;\
             int i_qpn, mbn_xy, mbn_8x8, mbn_4x4;\
             ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
+            /* We don't have to consider the MBAFF case of a slice breaking in the middle\
+             * of a row because x264 doesn't support that case.  If we add support for that,\
+             * this will have to become significantly more complex. */\
+            if( i_dir == 0 && (mb_x == 0 || (!deblock_on_slice_edges &&\
+                h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-1])) )\
+                i_edge++;\
+            if( i_dir == 1 && (mb_y <= b_interlaced || (!deblock_on_slice_edges &&\
+                h->mb.slice_table[mb_xy] != h->mb.slice_table[mb_xy-(h->mb.i_mb_stride<<b_interlaced)])) )\
+                i_edge++;\
             if( i_edge )\
                 i_edge+= b_8x8_transform;\
             else\
index 56bbe09055aaece07abcd73f7076f21d927aca08..af37910067028deb1e32b8bc52a7bc2ed943fa8f 100644 (file)
@@ -675,7 +675,7 @@ void x264_mb_mc( x264_t *h )
     }
 }
 
-int x264_macroblock_cache_init( x264_t *h )
+int x264_macroblock_cache_allocate( x264_t *h )
 {
     int i_mb_count = h->mb.i_mb_count;
 
@@ -689,6 +689,8 @@ int x264_macroblock_cache_init( x264_t *h )
     CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
     CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
     CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
+    CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
+    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
 
     /* 0 -> 3 top(4), 4 -> 6 : left(3) */
     CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
@@ -755,22 +757,11 @@ int x264_macroblock_cache_init( x264_t *h )
 #undef ALIGN
     }
 
-    for( int i = 0; i <= h->param.b_interlaced; i++ )
-        for( int j = 0; j < 3; j++ )
-        {
-            /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
-            CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
-            h->mb.intra_border_backup[i][j] += 8;
-        }
-
     return 0;
 fail: return -1;
 }
-void x264_macroblock_cache_end( x264_t *h )
+void x264_macroblock_cache_free( x264_t *h )
 {
-    for( int i = 0; i <= h->param.b_interlaced; i++ )
-        for( int j = 0; j < 3; j++ )
-            x264_free( h->mb.intra_border_backup[i][j] - 8 );
     for( int i = 0; i < 2; i++ )
         for( int j = 0; j < 32; j++ )
             x264_free( h->mb.mvr[i][j] );
@@ -783,6 +774,7 @@ void x264_macroblock_cache_end( x264_t *h )
         x264_free( h->mb.mvd[0] );
         x264_free( h->mb.mvd[1] );
     }
+    x264_free( h->mb.slice_table );
     x264_free( h->mb.intra4x4_pred_mode );
     x264_free( h->mb.non_zero_count );
     x264_free( h->mb.mb_transform_size );
@@ -790,6 +782,47 @@ void x264_macroblock_cache_end( x264_t *h )
     x264_free( h->mb.cbp );
     x264_free( h->mb.qp );
 }
+
+int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
+{
+    if( !b_lookahead )
+        for( int i = 0; i <= h->param.b_interlaced; i++ )
+            for( int j = 0; j < 3; j++ )
+            {
+                /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
+                CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
+                h->intra_border_backup[i][j] += 8;
+            }
+
+    /* Allocate scratch buffer */
+    int scratch_size = 0;
+    if( !b_lookahead )
+    {
+        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
+        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
+        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
+            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
+        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
+        scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_nnz );
+    }
+    int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
+    scratch_size = X264_MAX( scratch_size, buf_mbtree );
+    CHECKED_MALLOC( h->scratch_buffer, scratch_size );
+
+    return 0;
+fail: return -1;
+}
+
+void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
+{
+    if( !b_lookahead )
+        for( int i = 0; i <= h->param.b_interlaced; i++ )
+            for( int j = 0; j < 3; j++ )
+                x264_free( h->intra_border_backup[i][j] - 8 );
+    x264_free( h->scratch_buffer );
+}
+
 void x264_macroblock_slice_init( x264_t *h )
 {
     h->mb.mv[0] = h->fdec->mv[0];
@@ -898,8 +931,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
                            ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride
                            : w * (mb_x + mb_y * i_stride);
     const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
-    const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
-                                &h->mb.intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
+    const uint8_t *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
     int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
     x264_frame_t **fref[2] = { h->fref0, h->fref1 };
     if( h->mb.b_interlaced )
@@ -908,10 +940,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
     h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
     h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
         h->mb.pic.p_fenc_plane[i], i_stride2, w );
-    if( mb_y > 0 )
-        memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
-    else
-        memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
+    memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
     if( h->mb.b_interlaced )
         for( int j = 0; j < w; j++ )
             h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
@@ -1327,6 +1356,7 @@ void x264_macroblock_cache_save( x264_t *h )
     x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
 
     h->mb.type[i_mb_xy] = i_mb_type;
+    h->mb.slice_table[i_mb_xy] = h->sh.i_first_mb;
     h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
     h->mb.i_mb_prev_xy = i_mb_xy;
 
index dc23842e1b8f7308d06146138ef1a6914b32358e..57094f054b77fddaba942e24f84203100f053543 100644 (file)
@@ -260,13 +260,18 @@ enum cabac_ctx_block_cat_e
     DCT_LUMA_8x8  = 5,
 };
 
+/* Per-frame allocation: is allocated per-thread only in frame-threads mode. */
+int  x264_macroblock_cache_allocate( x264_t *h );
+void x264_macroblock_cache_free( x264_t *h );
+
+/* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */
+int  x264_macroblock_thread_allocate( x264_t *h, int b_lookahead );
+void x264_macroblock_thread_free( x264_t *h, int b_lookahead );
 
-int  x264_macroblock_cache_init( x264_t *h );
 void x264_macroblock_slice_init( x264_t *h );
 void x264_macroblock_thread_init( x264_t *h );
 void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y );
 void x264_macroblock_cache_save( x264_t *h );
-void x264_macroblock_cache_end( x264_t *h );
 
 void x264_macroblock_bipred_init( x264_t *h );
 
index 2dfe79d95502abd37324aecd27969474ad2d77d7..81b77dd166c5baa37ece5ada8ce48e57e1e9a5ad 100644 (file)
@@ -158,7 +158,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
     int deblock_thresh = i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta);
     /* If effective qp <= 15, deblocking would have no effect anyway */
     if( param->b_deblocking_filter && (h->mb.b_variable_qp || 15 < deblock_thresh ) )
-        sh->i_disable_deblocking_filter_idc = 0;
+        sh->i_disable_deblocking_filter_idc = param->b_sliced_threads ? 2 : 0;
     else
         sh->i_disable_deblocking_filter_idc = 1;
     sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 << 1;
@@ -519,6 +519,16 @@ static int x264_validate_parameters( x264_t *h )
         h->param.rc.i_vbv_max_bitrate = 0;
     }
 
+    if( h->param.b_interlaced && h->param.i_slice_max_size )
+    {
+        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
+        h->param.i_slice_max_size = 0;
+    }
+    if( h->param.b_interlaced && h->param.i_slice_max_mbs )
+    {
+        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
+        h->param.i_slice_max_mbs = 0;
+    }
     int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
     if( h->param.b_sliced_threads )
         h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
@@ -527,16 +537,6 @@ static int x264_validate_parameters( x264_t *h )
         h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
         h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
         h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
-        if( h->param.b_interlaced && h->param.i_slice_max_size )
-        {
-            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
-            h->param.i_slice_max_size = 0;
-        }
-        if( h->param.b_interlaced && h->param.i_slice_max_mbs )
-        {
-            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
-            h->param.i_slice_max_mbs = 0;
-        }
         if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
             h->param.i_slice_count = 0;
     }
@@ -1059,23 +1059,13 @@ x264_t *x264_encoder_open( x264_param_t *param )
         CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
         h->thread[i]->out.i_nals_allocated = init_nal_count;
 
-        if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
+        if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 )
             goto fail;
     }
 
-    /* Allocate scratch buffer */
-    for( int i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
-    {
-        int buf_hpel = (h->fdec->i_width[0]+48) * sizeof(int16_t);
-        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
-        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
-        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
-            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
-        int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
-        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
-        int scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, X264_MAX( buf_mbtree, buf_nnz ) );
-        CHECKED_MALLOC( h->thread[i]->scratch_buffer, scratch_size );
-    }
+    for( int i = 0; i < h->param.i_threads; i++ )
+        if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 )
+            goto fail;
 
     if( x264_ratecontrol_new( h ) < 0 )
         goto fail;
@@ -1552,25 +1542,32 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
     h->mb.pic.i_fref[1] = h->i_ref1;
 }
 
-static void x264_fdec_filter_row( x264_t *h, int mb_y )
+static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
 {
     /* mb_y is the mb to be encoded next, not the mb to be filtered here */
     int b_hpel = h->fdec->b_kept_as_ref;
-    int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
-    int b_end = mb_y == h->sps->i_mb_height;
+    int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
+    int b_end = mb_y == h->i_threadslice_end;
+    int b_measure_quality = 1;
     int min_y = mb_y - (1 << h->sh.b_mbaff);
-    int max_y = b_end ? h->sps->i_mb_height : mb_y;
+    int b_start = min_y == h->i_threadslice_start;
+    int max_y = b_end ? h->i_threadslice_end : mb_y;
     b_deblock &= b_hpel || h->param.psz_dump_yuv;
+    if( h->param.b_sliced_threads && b_start && min_y && !b_inloop )
+    {
+        b_deblock = 0;         /* We already deblocked on the inloop pass. */
+        b_measure_quality = 0; /* We already measured quality on the inloop pass. */
+    }
     if( mb_y & h->sh.b_mbaff )
         return;
-    if( min_y < 0 )
+    if( min_y < h->i_threadslice_start )
         return;
 
-    if( !b_end && !h->param.b_sliced_threads )
+    if( !b_end && b_inloop )
         for( int j = 0; j <= h->sh.b_mbaff; j++ )
             for( int i = 0; i < 3; i++ )
             {
-                memcpy( h->mb.intra_border_backup[j][i],
+                memcpy( h->intra_border_backup[j][i],
                         h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i],
                         h->sps->i_mb_width*16 >> !!i );
             }
@@ -1581,39 +1578,43 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
 
     if( b_hpel )
     {
-        x264_frame_expand_border( h, h->fdec, min_y, b_end );
+        int end = mb_y == h->sps->i_mb_height;
+        x264_frame_expand_border( h, h->fdec, min_y, end );
         if( h->param.analyse.i_subpel_refine )
         {
-            x264_frame_filter( h, h->fdec, min_y, b_end );
-            x264_frame_expand_border_filtered( h, h->fdec, min_y, b_end );
+            x264_frame_filter( h, h->fdec, min_y, end );
+            x264_frame_expand_border_filtered( h, h->fdec, min_y, end );
         }
     }
 
     if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
         x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
 
-    min_y = X264_MAX( min_y*16-8, 0 );
-    max_y = b_end ? h->param.i_height : mb_y*16-8;
-
-    if( h->param.analyse.b_psnr )
-        for( int i = 0; i < 3; i++ )
-            h->stat.frame.i_ssd[i] +=
-                x264_pixel_ssd_wxh( &h->pixf,
-                    h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
-                    h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
-                    h->param.i_width >> !!i, (max_y-min_y) >> !!i );
+    min_y = min_y*16 - 8 * !b_start;
+    max_y = b_end ? X264_MIN( h->i_threadslice_end*16 , h->param.i_height ) : mb_y*16 - 8;
 
-    if( h->param.analyse.b_ssim )
+    if( b_measure_quality )
     {
-        x264_emms();
-        /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
-         * and overlap by 4 */
-        min_y += min_y == 0 ? 2 : -6;
-        h->stat.frame.f_ssim +=
-            x264_pixel_ssim_wxh( &h->pixf,
-                h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
-                h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
-                h->param.i_width-2, max_y-min_y, h->scratch_buffer );
+        if( h->param.analyse.b_psnr )
+            for( int i = 0; i < 3; i++ )
+                h->stat.frame.i_ssd[i] +=
+                    x264_pixel_ssd_wxh( &h->pixf,
+                        h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
+                        h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
+                        h->param.i_width >> !!i, (max_y-min_y) >> !!i );
+
+        if( h->param.analyse.b_ssim )
+        {
+            x264_emms();
+            /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
+             * and overlap by 4 */
+            min_y += b_start ? 2 : -6;
+            h->stat.frame.f_ssim +=
+                x264_pixel_ssim_wxh( &h->pixf,
+                    h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
+                    h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
+                    h->param.i_width-2, max_y-min_y, h->scratch_buffer );
+        }
     }
 }
 
@@ -1808,8 +1809,8 @@ static int x264_slice_write( x264_t *h )
             }
         }
 
-        if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
-            x264_fdec_filter_row( h, i_mb_y );
+        if( i_mb_x == 0 && !h->mb.b_reencode_mb )
+            x264_fdec_filter_row( h, i_mb_y, 1 );
 
         /* load cache */
         x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
@@ -1971,14 +1972,13 @@ static int x264_slice_write( x264_t *h )
     if( x264_nal_end( h ) )
         return -1;
 
-    if( h->sh.i_last_mb == h->mb.i_mb_count-1 )
+    if( h->sh.i_last_mb == (h->i_threadslice_end * h->sps->i_mb_width - 1) )
     {
         h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
                                   + (h->out.i_nal*NALU_OVERHEAD * 8)
                                   - h->stat.frame.i_tex_bits
                                   - h->stat.frame.i_mv_bits;
-        if( !h->param.b_sliced_threads )
-            x264_fdec_filter_row( h, h->sps->i_mb_height );
+        x264_fdec_filter_row( h, h->i_threadslice_end, 1 );
     }
 
     return 0;
@@ -2099,9 +2099,9 @@ static int x264_threaded_slices_write( x264_t *h )
             return (intptr_t)ret;
     }
 
-    /* deblocking and hpel filtering */
-    for( int i = 0; i <= h->sps->i_mb_height; i++ )
-        x264_stack_align( x264_fdec_filter_row, h, i );
+    /* Go back and fix up the hpel on the borders between slices. */
+    for( int i = 1; i < h->param.i_threads; i++ )
+        x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );
 
     x264_threads_merge_ratecontrol( h );
 
@@ -2114,10 +2114,12 @@ static int x264_threaded_slices_write( x264_t *h )
             h->out.i_nal++;
             x264_nal_check_buffer( h );
         }
-        /* All entries in stat.frame are ints except for ssd/ssim,
-         * which are only calculated in the main thread. */
+        /* All entries in stat.frame are ints except for ssd/ssim. */
         for( int j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
             ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
+        for( int j = 0; j < 3; j++ )
+            h->stat.frame.i_ssd[j] += t->stat.frame.i_ssd[j];
+        h->stat.frame.f_ssim += t->stat.frame.f_ssim;
     }
 
     return 0;
@@ -3072,9 +3074,9 @@ void    x264_encoder_close  ( x264_t *h )
             (*frame)->i_reference_count--;
             if( (*frame)->i_reference_count == 0 )
                 x264_frame_delete( *frame );
-            x264_macroblock_cache_end( h->thread[i] );
+            x264_macroblock_cache_free( h->thread[i] );
         }
-        x264_free( h->thread[i]->scratch_buffer );
+        x264_macroblock_thread_free( h->thread[i], 0 );
         x264_free( h->thread[i]->out.p_bitstream );
         x264_free( h->thread[i]->out.nal);
         x264_free( h->thread[i] );
index 7a0c6d3a7ffc51ef3f8dcae86ba72855d523b6bf..5e29fb5bf32a47f23eecd700b5a8c2b475db61b8 100644 (file)
@@ -148,7 +148,10 @@ int x264_lookahead_init( x264_t *h, int i_slicetype_length )
 
     x264_t *look_h = h->thread[h->param.i_threads];
     *look_h = *h;
-    if( x264_macroblock_cache_init( look_h ) )
+    if( x264_macroblock_cache_allocate( look_h ) )
+        goto fail;
+
+    if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 )
         goto fail;
 
     if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
@@ -170,8 +173,8 @@ void x264_lookahead_delete( x264_t *h )
         x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
         x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
         x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
-        x264_macroblock_cache_end( h->thread[h->param.i_threads] );
-        x264_free( h->thread[h->param.i_threads]->scratch_buffer );
+        x264_macroblock_cache_free( h->thread[h->param.i_threads] );
+        x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 );
         x264_free( h->thread[h->param.i_threads] );
     }
     x264_synch_frame_list_delete( &h->lookahead->ifbuf );