git.sesse.net Git - x264/commitdiff
Sliced-threads: do hpel and deblock after returning
author Fiona Glaser <fiona@x264.com>
Fri, 24 Feb 2012 21:34:39 +0000 (13:34 -0800)
committer Fiona Glaser <fiona@x264.com>
Wed, 7 Mar 2012 02:11:32 +0000 (18:11 -0800)
Lowers encoding latency around 14% in sliced threads mode with preset superfast.
Additionally, even if there is no waiting time between frames, this improves parallelism, because hpel+deblock are done during the (single-threaded) lookahead.
For ease of debugging, dump-yuv forces all of the threads to wait and finish instead of setting b_full_recon.

common/common.h
common/deblock.c
common/frame.c
common/frame.h
common/macroblock.c
encoder/encoder.c

index 835fde529d522b7fd5c92f44c1b1010338081125..fab453e3da66a3ce500be867d99d4e67d4e50ae5 100644 (file)
@@ -470,9 +470,13 @@ struct x264_t
     x264_t          *thread[X264_THREAD_MAX+1];
     int             b_thread_active;
     int             i_thread_phase; /* which thread to use for the next frame */
+    int             i_thread_idx;   /* which thread this is */
     int             i_threadslice_start; /* first row in this thread slice */
     int             i_threadslice_end; /* row after the end of this thread slice */
+    int             i_threadslice_pass; /* which pass of encoding we are on */
     x264_threadpool_t *threadpool;
+    x264_pthread_mutex_t mutex;
+    x264_pthread_cond_t cv;
 
     /* bitstream output */
     struct
@@ -823,6 +827,9 @@ struct x264_t
             /* extra data required for mbaff in mv prediction */
             int16_t topright_mv[2][3][2];
             int8_t  topright_ref[2][3];
+
+            /* current mb deblock strength */
+            uint8_t (*deblock_strength)[8][4];
         } cache;
 
         /* */
index 51f0d7a8d22d136fe16880b0a4c98f32f597d046..bab9e5d9ffc6d63fc29d784439069d8eb8d53a0e 100644 (file)
@@ -395,7 +395,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         int mb_xy = h->mb.i_mb_xy;
         int transform_8x8 = h->mb.mb_transform_size[h->mb.i_mb_xy];
         int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
-        uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x];
+        uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?mb_xy:mb_x];
 
         pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey  + 16*mb_x;
         pixel *pixuv = h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x;
@@ -592,7 +592,7 @@ void x264_macroblock_deblock( x264_t *h )
     if( (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) || qp <= qp_thresh )
         return;
 
-    uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
+    uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
     if( intra_cur )
     {
         memset( &bs[0][1], 3, 3*4*sizeof(uint8_t) );
index 21d13476489fc26797c0b9d1c9d7629922a7b1e0..8a174062f512a3de6b4a7e5839fed9ffa54f7779 100644 (file)
@@ -480,9 +480,12 @@ static void ALWAYS_INLINE plane_expand_border( pixel *pix, int i_stride, int i_w
 #undef PPIXEL
 }
 
-void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
+void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y )
 {
-    int b_start = !mb_y;
+    int pad_top = mb_y == 0;
+    int pad_bot = mb_y == h->mb.i_mb_height - (1 << SLICE_MBAFF);
+    int b_start = mb_y == h->i_threadslice_start;
+    int b_end   = mb_y == h->i_threadslice_end - (1 << SLICE_MBAFF);
     if( mb_y & SLICE_MBAFF )
         return;
     for( int i = 0; i < frame->i_plane; i++ )
@@ -491,30 +494,31 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
         int v_shift = i && CHROMA_V_SHIFT;
         int stride = frame->i_stride[i];
         int width = 16*h->mb.i_mb_width;
-        int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
+        int height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
         int padh = PADH;
         int padv = PADV >> v_shift;
         // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
         if( b_end && !b_start )
             height += 4 >> (v_shift + SLICE_MBAFF);
         pixel *pix;
+        int starty = 16*mb_y - 4*!b_start;
         if( SLICE_MBAFF )
         {
             // border samples for each field are extended separately
-            pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
-            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, h_shift );
-            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, h_shift );
+            pix = frame->plane_fld[i] + (starty*stride >> v_shift);
+            plane_expand_border( pix, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
+            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift );
 
-            height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
+            height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
             if( b_end && !b_start )
                 height += 4 >> v_shift;
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
+            pix = frame->plane[i] + (starty*stride >> v_shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
         }
         else
         {
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
+            pix = frame->plane[i] + (starty*stride >> v_shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift );
         }
     }
 }
@@ -619,6 +623,23 @@ void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
     x264_pthread_mutex_unlock( &frame->mutex );
 }
 
+void x264_threadslice_cond_broadcast( x264_t *h, int pass ) /* store `pass` in h->i_threadslice_pass and wake waiters on h->cv */
+{
+    x264_pthread_mutex_lock( &h->mutex );
+    h->i_threadslice_pass = pass; /* written while holding h->mutex */
+    if( pass > 0 ) /* pass <= 0 only stores the value; no broadcast is issued */
+        x264_pthread_cond_broadcast( &h->cv );
+    x264_pthread_mutex_unlock( &h->mutex );
+}
+
+void x264_threadslice_cond_wait( x264_t *h, int pass ) /* block until h->i_threadslice_pass >= pass */
+{
+    x264_pthread_mutex_lock( &h->mutex );
+    while( h->i_threadslice_pass < pass ) /* predicate is re-checked after every wakeup */
+        x264_pthread_cond_wait( &h->cv, &h->mutex );
+    x264_pthread_mutex_unlock( &h->mutex );
+}
+
 /* list operators */
 
 void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
index 54415f7f47d9f5aa8cf88bb4271d66a9f8f9bf0b..31f0a3f1febdefadad98593702e8299f75042b88 100644 (file)
@@ -207,7 +207,7 @@ void          x264_frame_delete( x264_frame_t *frame );
 
 int           x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
 
-void          x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
+void          x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y );
 void          x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
 void          x264_frame_expand_border_lowres( x264_frame_t *frame );
 void          x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane );
@@ -225,6 +225,9 @@ void          x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mba
 void          x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed );
 void          x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed );
 
+void          x264_threadslice_cond_broadcast( x264_t *h, int pass );
+void          x264_threadslice_cond_wait( x264_t *h, int pass );
+
 void          x264_frame_push( x264_frame_t **list, x264_frame_t *frame );
 x264_frame_t *x264_frame_pop( x264_frame_t **list );
 void          x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
index 6bb0566e80a905767abb24a4ab8b4f06e1263187..8216799c305d78ae1452938275fbdd25303473d7 100644 (file)
@@ -368,7 +368,17 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
             }
         for( int i = 0; i <= PARAM_INTERLACED; i++ )
         {
-            CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
+            if( h->param.b_sliced_threads )
+            {
+                /* Only allocate the first one, and allocate it for the whole frame, because we
+                 * won't be deblocking until after the frame is fully encoded. */
+                if( h == h->thread[0] && !i )
+                    CHECKED_MALLOC( h->deblock_strength[0], sizeof(**h->deblock_strength) * h->mb.i_mb_count );
+                else
+                    h->deblock_strength[i] = h->thread[0]->deblock_strength[0];
+            }
+            else
+                CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
             h->deblock_strength[1] = h->deblock_strength[i];
         }
     }
@@ -401,7 +411,8 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
     if( !b_lookahead )
     {
         for( int i = 0; i <= PARAM_INTERLACED; i++ )
-            x264_free( h->deblock_strength[i] );
+            if( !h->param.b_sliced_threads || (h == h->thread[0] && !i) )
+                x264_free( h->deblock_strength[i] );
         for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ )
             for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
                 x264_free( h->intra_border_backup[i][j] - 16 );
@@ -858,6 +869,8 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m
 
     const x264_left_table_t *left_index_table = h->mb.left_index_table;
 
+    h->mb.cache.deblock_strength = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?h->mb.i_mb_xy:mb_x];
+
     /* load cache */
     if( h->mb.i_neighbour & MB_TOP )
     {
@@ -1432,7 +1445,7 @@ static void x264_macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][
 
 void x264_macroblock_deblock_strength( x264_t *h )
 {
-    uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x];
+    uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
     if( IS_INTRA( h->mb.i_type ) )
     {
         memset( bs[0][1], 3, 3*4*sizeof(uint8_t) );
index 85260d225fda33c38c5728ec4ccc53969effa756..03d05aed14792efbb6d459bc0d636611a47d0e79 100644 (file)
@@ -68,12 +68,28 @@ static double x264_ssim( double ssim )
     return -10.0 * log10( inv_ssim );
 }
 
+static int x264_threadpool_wait_all( x264_t *h ) /* wait on every thread flagged active; returns 0, or -1 on the first failed wait */
+{
+    for( int i = 0; i < h->param.i_threads; i++ )
+        if( h->thread[i]->b_thread_active )
+        {
+            h->thread[i]->b_thread_active = 0; /* cleared before waiting, so it stays 0 even on the error return */
+            if( (intptr_t)x264_threadpool_wait( h->threadpool, h->thread[i] ) < 0 ) /* a negative result is treated as failure */
+                return -1; /* returns immediately; remaining threads are not waited on */
+        }
+    return 0;
+}
+
 static void x264_frame_dump( x264_t *h )
 {
     FILE *f = fopen( h->param.psz_dump_yuv, "r+b" );
     if( !f )
         return;
 
+    /* Wait for the threads to finish deblocking */
+    if( h->param.b_sliced_threads )
+        x264_threadpool_wait_all( h );
+
     /* Write the frame in display order */
     int frame_size = FRAME_SIZE( h->param.i_height * h->param.i_width * sizeof(pixel) );
     fseek( f, (uint64_t)h->fdec->i_frame * frame_size, SEEK_SET );
@@ -921,9 +937,6 @@ static int x264_validate_parameters( x264_t *h, int b_open )
         h->param.i_nal_hrd = X264_NAL_HRD_VBR;
     }
 
-    if( h->param.psz_dump_yuv )
-        h->param.b_full_recon = 1;
-
     /* ensure the booleans are 0 or 1 so they can be used in math */
 #define BOOLIFY(x) h->param.x = !!h->param.x
     BOOLIFY( b_cabac );
@@ -1258,8 +1271,18 @@ x264_t *x264_encoder_open( x264_param_t *param )
         goto fail;
 
     h->thread[0] = h;
-    for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
-        CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
+    for( int i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
+    {
+        if( i )
+            CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
+        if( i < h->param.i_threads )
+        {
+            if( x264_pthread_mutex_init( &h->thread[i]->mutex, NULL ) )
+                goto fail;
+            if( x264_pthread_cond_init( &h->thread[i]->cv, NULL ) )
+                goto fail;
+        }
+    }
 
     for( int i = 0; i < h->param.i_threads; i++ )
     {
@@ -1354,6 +1377,11 @@ fail:
  ****************************************************************************/
 int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
 {
+    /* If the previous frame isn't done encoding, reconfiguring is probably dangerous. */
+    if( h->param.b_sliced_threads )
+        if( x264_threadpool_wait_all( h ) < 0 )
+            return -1;
+
     int rc_reconfig = 0;
     h = h->thread[h->thread[0]->i_thread_phase];
     x264_set_aspect_ratio( h, param, 0 );
@@ -1830,7 +1858,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
     h->mb.pic.i_fref[1] = h->i_ref[1];
 }
 
-static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
+static void x264_fdec_filter_row( x264_t *h, int mb_y, int pass )
 {
     /* mb_y is the mb to be encoded next, not the mb to be filtered here */
     int b_hpel = h->fdec->b_kept_as_ref;
@@ -1843,11 +1871,30 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
      * above each MB, as bS=4 doesn't happen for the top of interlaced mbpairs. */
     int minpix_y = min_y*16 - 4 * !b_start;
     int maxpix_y = mb_y*16 - 4 * !b_end;
-    b_deblock &= b_hpel || h->param.b_full_recon;
-    if( h->param.b_sliced_threads && b_start && min_y && !b_inloop )
+    b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv;
+    if( h->param.b_sliced_threads )
     {
-        b_deblock = 0;         /* We already deblocked on the inloop pass. */
-        b_measure_quality = 0; /* We already measured quality on the inloop pass. */
+        switch( pass )
+        {
+            /* During encode: only do deblock if asked for */
+            default:
+            case 0:
+                b_deblock &= h->param.b_full_recon;
+                b_hpel = 0;
+                break;
+            /* During post-encode pass: do deblock if not done yet, do hpel for all
+             * rows except those between slices. */
+            case 1:
+                b_deblock &= !h->param.b_full_recon;
+                b_hpel &= !(b_start && min_y > 0);
+                b_measure_quality = 0;
+                break;
+            /* Final pass: do the rows between slices in sequence. */
+            case 2:
+                b_deblock = 0;
+                b_measure_quality = 0;
+                break;
+        }
     }
     if( mb_y & SLICE_MBAFF )
         return;
@@ -1861,17 +1908,19 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
     /* FIXME: Prediction requires different borders for interlaced/progressive mc,
      * but the actual image data is equivalent. For now, maintain this
      * consistency by copying deblocked pixels between planes. */
-    if( PARAM_INTERLACED )
+    if( PARAM_INTERLACED && (!h->param.b_sliced_threads || pass == 1) )
         for( int p = 0; p < h->fdec->i_plane; p++ )
             for( int i = minpix_y>>(CHROMA_V_SHIFT && p); i < maxpix_y>>(CHROMA_V_SHIFT && p); i++ )
                 memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],
                         h->fdec->plane[p]     + i*h->fdec->i_stride[p],
                         h->mb.i_mb_width*16*sizeof(pixel) );
 
+    if( h->fdec->b_kept_as_ref && (!h->param.b_sliced_threads || pass == 1) )
+        x264_frame_expand_border( h, h->fdec, min_y );
     if( b_hpel )
     {
         int end = mb_y == h->mb.i_mb_height;
-        x264_frame_expand_border( h, h->fdec, min_y, end );
+        /* Can't do hpel until the previous slice is done encoding. */
         if( h->param.analyse.i_subpel_refine )
         {
             x264_frame_filter( h, h->fdec, min_y, end );
@@ -1879,7 +1928,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop )
         }
     }
 
-    if( SLICE_MBAFF )
+    if( SLICE_MBAFF && pass == 0 )
         for( int i = 0; i < 3; i++ )
         {
             XCHG( pixel *, h->intra_border_backup[0][i], h->intra_border_backup[3][i] );
@@ -2148,7 +2197,7 @@ static int x264_slice_write( x264_t *h )
     int orig_last_mb = h->sh.i_last_mb;
     uint8_t *last_emu_check;
     x264_bs_bak_t bs_bak[2];
-    b_deblock &= b_hpel || h->param.b_full_recon;
+    b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv;
     bs_realign( &h->out.bs );
 
     /* Slice */
@@ -2200,7 +2249,7 @@ static int x264_slice_write( x264_t *h )
             if( !(i_mb_y & SLICE_MBAFF) && h->param.rc.i_vbv_buffer_size )
                 x264_bitstream_backup( h, &bs_bak[1], i_skip, 1 );
             if( !h->mb.b_reencode_mb )
-                x264_fdec_filter_row( h, i_mb_y, 1 );
+                x264_fdec_filter_row( h, i_mb_y, 0 );
         }
 
         if( !(i_mb_y & SLICE_MBAFF) && back_up_bitstream )
@@ -2447,7 +2496,23 @@ reencode:
                                   + (h->out.i_nal*NALU_OVERHEAD * 8)
                                   - h->stat.frame.i_tex_bits
                                   - h->stat.frame.i_mv_bits;
-        x264_fdec_filter_row( h, h->i_threadslice_end, 1 );
+        x264_fdec_filter_row( h, h->i_threadslice_end, 0 );
+
+        if( h->param.b_sliced_threads )
+        {
+            /* Tell the main thread we're done. */
+            x264_threadslice_cond_broadcast( h, 1 );
+            /* Do hpel now */
+            for( int mb_y = h->i_threadslice_start; mb_y <= h->i_threadslice_end; mb_y++ )
+                x264_fdec_filter_row( h, mb_y, 1 );
+            x264_threadslice_cond_broadcast( h, 2 );
+            /* Do the first row of hpel, now that the previous slice is done */
+            if( h->i_thread_idx > 0 )
+            {
+                x264_threadslice_cond_wait( h->thread[h->i_thread_idx-1], 2 );
+                x264_fdec_filter_row( h, h->i_threadslice_start + (1 << SLICE_MBAFF), 2 );
+            }
+        }
     }
 
     return 0;
@@ -2488,7 +2553,7 @@ static void *x264_slices_write( x264_t *h )
 #if HAVE_VISUALIZE
     if( h->param.b_visualize )
         if( x264_visualize_init( h ) )
-            return (void *)-1;
+            goto fail;
 #endif
 
     /* init stats */
@@ -2521,7 +2586,7 @@ static void *x264_slices_write( x264_t *h )
         }
         h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
         if( x264_stack_align( x264_slice_write, h ) )
-            return (void *)-1;
+            goto fail;
         h->sh.i_first_mb = h->sh.i_last_mb + 1;
         // if i_first_mb is not the last mb in a row then go to the next mb in MBAFF order
         if( SLICE_MBAFF && h->sh.i_first_mb % h->mb.i_mb_width )
@@ -2537,6 +2602,12 @@ static void *x264_slices_write( x264_t *h )
 #endif
 
     return (void *)0;
+
+fail:
+    /* Tell other threads we're done, so they wouldn't wait for it */
+    if( h->param.b_sliced_threads )
+        x264_threadslice_cond_broadcast( h, 2 );
+    return (void *)-1;
 }
 
 static int x264_threaded_slices_write( x264_t *h )
@@ -2561,26 +2632,19 @@ static int x264_threaded_slices_write( x264_t *h )
 
     x264_threads_distribute_ratecontrol( h );
 
-    /* dispatch */
+    /* setup */
     for( int i = 0; i < h->param.i_threads; i++ )
     {
-        x264_threadpool_run( h->threadpool, (void*)x264_slices_write, h->thread[i] );
+        h->thread[i]->i_thread_idx = i;
         h->thread[i]->b_thread_active = 1;
+        x264_threadslice_cond_broadcast( h->thread[i], 0 );
     }
+    /* dispatch */
     for( int i = 0; i < h->param.i_threads; i++ )
-    {
-        h->thread[i]->b_thread_active = 0;
-        if( (intptr_t)x264_threadpool_wait( h->threadpool, h->thread[i] ) )
-            return -1;
-    }
-
-    /* Go back and fix up the hpel on the borders between slices. */
-    for( int i = 1; i < h->param.i_threads; i++ )
-    {
-        x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 1, 0 );
-        if( SLICE_MBAFF )
-            x264_fdec_filter_row( h->thread[i], h->thread[i]->i_threadslice_start + 2, 0 );
-    }
+        x264_threadpool_run( h->threadpool, (void*)x264_slices_write, h->thread[i] );
+    /* wait */
+    for( int i = 0; i < h->param.i_threads; i++ )
+        x264_threadslice_cond_wait( h->thread[i], 1 );
 
     x264_threads_merge_ratecontrol( h );
 
@@ -2677,11 +2741,6 @@ int     x264_encoder_encode( x264_t *h,
         x264_cpu_mask_misalign_sse();
 #endif
 
-    // ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
-    if( x264_reference_update( h ) )
-        return -1;
-    h->fdec->i_lines_completed = -1;
-
     /* no data out */
     *pi_nal = 0;
     *pp_nal = NULL;
@@ -2777,6 +2836,12 @@ int     x264_encoder_encode( x264_t *h,
     /* ------------------- Get frame to be encoded ------------------------- */
     /* 4: get picture to encode */
     h->fenc = x264_frame_shift( h->frames.current );
+
+    /* If applicable, wait for previous frame reconstruction to finish */
+    if( h->param.b_sliced_threads )
+        if( x264_threadpool_wait_all( h ) < 0 )
+            return -1;
+
     if( h->i_frame == h->i_thread_frames - 1 )
         h->i_reordered_pts_delay = h->fenc->i_reordered_pts;
     if( h->fenc->param )
@@ -2786,6 +2851,11 @@ int     x264_encoder_encode( x264_t *h,
             h->fenc->param->param_free( h->fenc->param );
     }
 
+    // ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
+    if( x264_reference_update( h ) )
+        return -1;
+    h->fdec->i_lines_completed = -1;
+
     if( !IS_X264_TYPE_I( h->fenc->i_type ) )
     {
         int valid_refs_left = 0;
@@ -3117,7 +3187,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 {
     char psz_message[80];
 
-    if( h->b_thread_active )
+    if( !h->param.b_sliced_threads && h->b_thread_active )
     {
         h->b_thread_active = 0;
         if( (intptr_t)x264_threadpool_wait( h->threadpool, h ) )
@@ -3381,6 +3451,8 @@ void    x264_encoder_close  ( x264_t *h )
 
     x264_lookahead_delete( h );
 
+    if( h->param.b_sliced_threads )
+        x264_threadpool_wait_all( h );
     if( h->param.i_threads > 1 )
         x264_threadpool_delete( h->threadpool );
     if( h->i_thread_frames > 1 )
@@ -3675,7 +3747,7 @@ void    x264_encoder_close  ( x264_t *h )
     x264_free( h->nal_buffer );
     x264_analyse_free_costs( h );
 
-    if( h->i_thread_frames > 1)
+    if( h->i_thread_frames > 1 )
         h = h->thread[h->i_thread_phase];
 
     /* frames */
@@ -3717,7 +3789,9 @@ void    x264_encoder_close  ( x264_t *h )
         }
         x264_macroblock_thread_free( h->thread[i], 0 );
         x264_free( h->thread[i]->out.p_bitstream );
-        x264_free( h->thread[i]->out.nal);
+        x264_free( h->thread[i]->out.nal );
+        x264_pthread_mutex_destroy( &h->thread[i]->mutex );
+        x264_pthread_cond_destroy( &h->thread[i]->cv );
         x264_free( h->thread[i] );
     }
 }