git.sesse.net Git - x264/commitdiff
Bring back slice-based threading support
author Fiona Glaser <fiona@x264.com>
Mon, 7 Dec 2009 08:49:41 +0000 (00:49 -0800)
committer Fiona Glaser <fiona@x264.com>
Wed, 9 Dec 2009 12:47:27 +0000 (04:47 -0800)
Enabled with --sliced-threads
Unlike normal threading, adds no encoding latency.
Less efficient than normal threading, both performance and compression-wise.
Useful for low-latency encoding environments where performance is still important, such as HD videoconferencing.
Add --tune zerolatency, which eliminates all x264 encoder-side latency (no delayed frames at all).
Some tweaks to VBV ratecontrol and lookahead (in addition to those required by sliced threading).
Commit sponsored by a media streaming company that wishes to remain anonymous.

common/common.c
common/common.h
common/macroblock.c
encoder/analyse.c
encoder/analyse.h
encoder/encoder.c
encoder/lookahead.c
encoder/ratecontrol.c
encoder/ratecontrol.h
x264.c
x264.h

index 42f759b1803b043bcb870aba5b57b476243b5b6f..b36ec0ca49a4d54d05023245e4d1c485dcea1322 100644 (file)
@@ -281,6 +281,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         else
             p->i_threads = atoi(value);
     }
+    OPT("sliced-threads")
+        p->b_sliced_threads = atobool(value);
     OPT("sync-lookahead")
     {
         if( !strcmp(value, "auto") )
@@ -888,6 +890,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
     s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
     s += sprintf( s, " threads=%d", p->i_threads );
+    s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
     if( p->i_slice_count )
         s += sprintf( s, " slices=%d", p->i_slice_count );
     if( p->i_slice_max_size )
index 0ba8fb984d8d79f7141559c85cb9d073a748c325..417ac9e8ab97522ba3fbec86ef7d22298851d7ce 100644 (file)
@@ -341,6 +341,8 @@ struct x264_t
     x264_pthread_t  thread_handle;
     int             b_thread_active;
     int             i_thread_phase; /* which thread to use for the next frame */
+    int             i_threadslice_start; /* first row in this thread slice */
+    int             i_threadslice_end; /* row after the end of this thread slice */
 
     /* bitstream output */
     struct
index 2dacf7abca370f4800ca9c14103e620a7b9e7b3a..955a8288073be132de2779a07ebc136d9dbafeff 100644 (file)
@@ -768,42 +768,6 @@ int x264_macroblock_cache_init( x264_t *h )
     memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
     memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
 
-    /* fdec:      fenc:
-     * yyyyyyy
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * uuu vvv    UUVV
-     * uUU vVV    UUVV
-     * uUU vVV
-     */
-    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
-    h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
-    h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
-    h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
-    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
-    h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
-
-    h->mb.i_neighbour4[6] =
-    h->mb.i_neighbour4[9] =
-    h->mb.i_neighbour4[12] =
-    h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
-    h->mb.i_neighbour4[3] =
-    h->mb.i_neighbour4[7] =
-    h->mb.i_neighbour4[11] =
-    h->mb.i_neighbour4[13] =
-    h->mb.i_neighbour4[15] =
-    h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
-
-    int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
-    int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
-    int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
-    int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
-        ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
-    int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
-    CHECKED_MALLOC( h->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
-
     return 0;
 fail: return -1;
 }
@@ -832,7 +796,6 @@ void x264_macroblock_cache_end( x264_t *h )
     x264_free( h->mb.skipbp );
     x264_free( h->mb.cbp );
     x264_free( h->mb.qp );
-    x264_free( h->scratch_buffer );
 }
 void x264_macroblock_slice_init( x264_t *h )
 {
@@ -871,6 +834,34 @@ void x264_macroblock_slice_init( x264_t *h )
         memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
 
     setup_inverse_delta_pocs( h );
+
+    /* fdec:      fenc:
+     * yyyyyyy
+     * yYYYY      YYYY
+     * yYYYY      YYYY
+     * yYYYY      YYYY
+     * yYYYY      YYYY
+     * uuu vvv    UUVV
+     * uUU vVV    UUVV
+     * uUU vVV
+     */
+    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
+    h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+    h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
+    h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
+    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
+    h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
+
+    h->mb.i_neighbour4[6] =
+    h->mb.i_neighbour4[9] =
+    h->mb.i_neighbour4[12] =
+    h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
+    h->mb.i_neighbour4[3] =
+    h->mb.i_neighbour4[7] =
+    h->mb.i_neighbour4[11] =
+    h->mb.i_neighbour4[13] =
+    h->mb.i_neighbour4[15] =
+    h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
 }
 
 void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
@@ -899,8 +890,10 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
     const int i_pix_offset = h->mb.b_interlaced
                            ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
                            : w * (i_mb_x + i_mb_y * i_stride);
+    const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+    const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
+                                &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
     int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
-    const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
     x264_frame_t **fref[2] = { h->fref0, h->fref1 };
     int j, k;
     if( h->mb.b_interlaced )
@@ -909,13 +902,13 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
     h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
     h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
         h->mb.pic.p_fenc_plane[i], i_stride2, w );
-    memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
+    if( i_mb_y > 0 )
+        memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
+    else
+        memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
     if( h->mb.b_interlaced || h->mb.b_reencode_mb )
-    {
-        const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
         for( j = 0; j < w; j++ )
             h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
-    }
     for( j = 0; j < h->mb.pic.i_fref[0]; j++ )
     {
         h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
index b861fb13282c1d6762c14e84f10a1096b3a3c97d..74f77dc0bcb67c794139bbe2674857a397aab967 100644 (file)
@@ -285,6 +285,38 @@ void x264_analyse_free_costs( x264_t *h )
     }
 }
 
+void x264_analyse_weight_frame( x264_t *h, int end ) /* Weight further lines of a reference plane into h->fenc->weighted[], up to row `end` plus a 16-line margin and vertical padding. */
+{
+    int j;
+    for( j=0; j<h->i_ref0; j++ )
+    {
+        if( h->sh.weight[j][0].weightfn ) /* act on the first reference whose weight[j][0] has a function set; the break below ends the search */
+        {
+            x264_frame_t *frame = h->fref0[j];
+            int width = frame->i_width[0] + 2*PADH; /* plane-0 width plus 2*PADH (presumably the horizontal padding on both sides) */
+            int i_padv = PADV << h->param.b_interlaced; /* PADV shifted left by the interlaced flag */
+            int offset, height;
+            uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH; /* i_padv rows above and PADH bytes left of filtered[0] */
+            int k;
+            height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted; /* new lines to weight: target row (capped at i_lines[0] + 2*i_padv) minus lines already weighted */
+            offset = h->fenc->i_lines_weighted*frame->i_stride[0]; /* skip the lines handled by earlier calls; must be computed before the counter is advanced */
+            h->fenc->i_lines_weighted += height;
+            if( height ) /* NOTE(review): a negative height also passes this test -- confirm `end` never moves backwards between calls */
+            {
+                for( k = j; k < h->i_ref0; k++ ) /* every weighted reference from j onwards is fed from the same src (frame j) */
+                    if( h->sh.weight[k][0].weightfn )
+                    {
+                        uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH; /* same origin adjustment as src, applied to the destination plane for reference k */
+                        x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
+                                                 src + offset, frame->i_stride[0],
+                                                 width, height, &h->sh.weight[k][0] );
+                    }
+            }
+            break; /* only the first weighted reference drives the update */
+        }
+    }
+}
+
 /* initialize an array of lambda*nbits for all possible mvs */
 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 {
@@ -361,13 +393,13 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
         h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
         h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
-        if( h->mb.i_mb_x == 0)
+        if( h->mb.i_mb_x == 0 )
         {
             int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
             int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
             int thread_mvy_range = i_fmv_range;
 
-            if( h->param.i_threads > 1 )
+            if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
             {
                 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
@@ -387,33 +419,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
                 if( h->mb.b_interlaced )
                     thread_mvy_range >>= 1;
 
-                for( j=0; j<h->i_ref0; j++ )
-                {
-                    if( h->sh.weight[j][0].weightfn )
-                    {
-                        x264_frame_t *frame = h->fref0[j];
-                        int width = frame->i_width[0] + 2*PADH;
-                        int i_padv = PADV << h->param.b_interlaced;
-                        int offset, height;
-                        uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
-                        int k;
-                        height = X264_MIN( 16 + thread_mvy_range + pix_y + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
-                        offset = h->fenc->i_lines_weighted*frame->i_stride[0];
-                        h->fenc->i_lines_weighted += height;
-                        if( height )
-                        {
-                            for( k = j; k < h->i_ref0; k++ )
-                                if( h->sh.weight[k][0].weightfn )
-                                {
-                                    uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
-                                    x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
-                                                             src + offset, frame->i_stride[0],
-                                                             width, height, &h->sh.weight[k][0] );
-                                }
-                        }
-                        break;
-                    }
-                }
+                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
             }
 
             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
@@ -1247,7 +1253,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         {
             h->mb.i_type = P_SKIP;
             x264_analyse_update_cache( h, a );
-            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
             return;
         }
 
@@ -1263,7 +1269,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     }
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
-    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
 
     h->mb.i_type = P_L0;
     if( a->i_mbrd )
@@ -2419,7 +2425,7 @@ void x264_macroblock_analyse( x264_t *h )
         analysis.b_try_pskip = 0;
         if( h->param.analyse.b_fast_pskip )
         {
-            if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
+            if( h->param.i_threads > 1 && !h->param.b_sliced_threads && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
                 // FIXME don't need to check this if the reference frame is done
                 {}
             else if( h->param.analyse.i_subpel_refine >= 3 )
@@ -2437,7 +2443,7 @@ void x264_macroblock_analyse( x264_t *h )
         {
             h->mb.i_type = P_SKIP;
             h->mb.i_partition = D_16x16;
-            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
         }
         else
         {
@@ -3145,7 +3151,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
     }
 
 #ifndef NDEBUG
-    if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
+    if( h->param.i_threads > 1 && !h->param.b_sliced_threads && !IS_INTRA(h->mb.i_type) )
     {
         int l;
         for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
index 0e9ad7d3e574da72c93a5608dafe0f57b235bff6..7c2c22c9ec7e2ec8bd8b38a8fb39cb2cea51c392 100644 (file)
@@ -26,6 +26,7 @@
 
 int x264_analyse_init_costs( x264_t *h, int qp );
 void x264_analyse_free_costs( x264_t *h );
+void x264_analyse_weight_frame( x264_t *h, int end );
 void x264_macroblock_analyse( x264_t *h );
 void x264_slicetype_decide( x264_t *h );
 
index 7d38c1bf310e2f4c486b8152213edb597907a868..82d1f02efe7e8d449b6c00fb193e58600dec88be 100644 (file)
@@ -31,6 +31,7 @@
 #include "analyse.h"
 #include "ratecontrol.h"
 #include "macroblock.h"
+#include "me.h"
 
 #if VISUALIZE
 #include "common/visualize.h"
@@ -409,7 +410,16 @@ static int x264_validate_parameters( x264_t *h )
         x264_log( h, X264_LOG_WARNING, "not compiled with pthread support!\n");
         h->param.i_threads = 1;
 #endif
+        /* Avoid absurdly small thread slices as they can reduce performance
+         * and VBV compliance.  Capped at an arbitrary 4 rows per thread. */
+        if( h->param.b_sliced_threads )
+        {
+            int max_threads = (h->param.i_height+15)/16 / 4;
+            h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
+        }
     }
+    else
+        h->param.b_sliced_threads = 0;
 
     if( h->param.b_interlaced )
     {
@@ -497,21 +507,26 @@ static int x264_validate_parameters( x264_t *h )
     h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
 
     int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
-    h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
-    h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
-    h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
-    if( h->param.b_interlaced && h->param.i_slice_max_size )
-    {
-        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
-        h->param.i_slice_max_size = 0;
-    }
-    if( h->param.b_interlaced && h->param.i_slice_max_mbs )
+    if( h->param.b_sliced_threads )
+        h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
+    else
     {
-        x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
-        h->param.i_slice_max_mbs = 0;
+        h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
+        h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
+        h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
+        if( h->param.b_interlaced && h->param.i_slice_max_size )
+        {
+            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
+            h->param.i_slice_max_size = 0;
+        }
+        if( h->param.b_interlaced && h->param.i_slice_max_mbs )
+        {
+            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
+            h->param.i_slice_max_mbs = 0;
+        }
+        if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
+            h->param.i_slice_count = 0;
     }
-    if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
-        h->param.i_slice_count = 0;
 
     h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 );
     if( h->param.i_keyint_max <= 0 )
@@ -553,7 +568,7 @@ static int x264_validate_parameters( x264_t *h )
 #ifdef HAVE_PTHREAD
     if( h->param.i_sync_lookahead )
         h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX );
-    if( h->param.rc.b_stat_read || h->param.i_threads == 1 )
+    if( h->param.rc.b_stat_read || h->param.i_threads == 1 || h->param.b_sliced_threads )
         h->param.i_sync_lookahead = 0;
 #else
     h->param.i_sync_lookahead = 0;
@@ -676,7 +691,7 @@ static int x264_validate_parameters( x264_t *h )
     if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced )
         h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
 
-    if( h->param.i_threads > 1 )
+    if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
     {
         int r = h->param.analyse.i_mv_range_thread;
         int r2;
@@ -851,7 +866,8 @@ x264_t *x264_encoder_open( x264_param_t *param )
     if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
         h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
     i_slicetype_length = h->frames.i_delay;
-    h->frames.i_delay += h->param.i_threads - 1;
+    if( !h->param.b_sliced_threads )
+        h->frames.i_delay += h->param.i_threads - 1;
     h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
     h->frames.i_delay += h->param.i_sync_lookahead;
 
@@ -944,23 +960,45 @@ x264_t *x264_encoder_open( x264_param_t *param )
     for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
         CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
 
+    if( x264_lookahead_init( h, i_slicetype_length ) )
+        goto fail;
+
     for( i = 0; i < h->param.i_threads; i++ )
     {
+        int init_nal_count = h->param.i_slice_count + 3;
+        int allocate_threadlocal_data = !h->param.b_sliced_threads || !i;
         if( i > 0 )
             *h->thread[i] = *h;
-        h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
-        if( !h->thread[i]->fdec )
-            goto fail;
+
+        if( allocate_threadlocal_data )
+        {
+            h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
+            if( !h->thread[i]->fdec )
+                goto fail;
+        }
+        else
+            h->thread[i]->fdec = h->thread[0]->fdec;
+
         CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
-        /* Start each thread with room for 8 NAL units; it'll realloc later if needed. */
-        CHECKED_MALLOC( h->thread[i]->out.nal, 8*sizeof(x264_nal_t) );
-        h->thread[i]->out.i_nals_allocated = 8;
-        if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
+        /* Start each thread with room for init_nal_count NAL units; it'll realloc later if needed. */
+        CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
+        h->thread[i]->out.i_nals_allocated = init_nal_count;
+
+        if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
             goto fail;
     }
 
-    if( x264_lookahead_init( h, i_slicetype_length ) )
-        goto fail;
+    /* Allocate scratch buffer */
+    for( i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
+    {
+        int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
+        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
+        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
+        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
+            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
+        int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
+        CHECKED_MALLOC( h->thread[i]->scratch_buffer, X264_MAX4( buf_hpel, buf_ssim, buf_tesa, buf_mbtree ) );
+    }
 
     if( x264_ratecontrol_new( h ) < 0 )
         goto fail;
@@ -1009,8 +1047,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
     COPY( b_deblocking_filter );
     COPY( i_deblocking_filter_alphac0 );
     COPY( i_deblocking_filter_beta );
-    COPY( analyse.intra );
     COPY( analyse.inter );
+    COPY( analyse.intra );
     COPY( analyse.i_direct_mv_pred );
     /* Scratch buffer prevents me_range from being increased for esa/tesa */
     if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range )
@@ -1056,13 +1094,9 @@ static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
     nal->i_payload= 0;
     nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
 }
-static int x264_nal_end( x264_t *h )
+/* if number of allocated nals is not enough, re-allocate a larger one. */
+static int x264_nal_check_buffer( x264_t *h )
 {
-    x264_nal_t *nal = &h->out.nal[h->out.i_nal];
-    nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - nal->p_payload;
-    h->out.i_nal++;
-
-    /* if number of allocated nals is not enough, re-allocate a larger one. */
     if( h->out.i_nal >= h->out.i_nals_allocated )
     {
         x264_nal_t *new_out = x264_malloc( sizeof(x264_nal_t) * (h->out.i_nals_allocated*2) );
@@ -1075,6 +1109,14 @@ static int x264_nal_end( x264_t *h )
     }
     return 0;
 }
+static int x264_nal_end( x264_t *h ) /* Finish the current NAL unit: record its payload size, advance the NAL count, then run the NAL-array capacity check. */
+{
+    x264_nal_t *nal = &h->out.nal[h->out.i_nal];
+    nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - nal->p_payload; /* distance from p_payload to the current write position (bs_pos()/8 is presumably its byte offset) */
+    h->out.i_nal++;
+
+    return x264_nal_check_buffer( h ); /* the check's status is passed straight back to the caller */
+}
 
 static int x264_encoder_encapsulate_nals( x264_t *h )
 {
@@ -1396,7 +1438,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
     if( min_y < 0 )
         return;
 
-    if( !b_end )
+    if( !b_end && !h->param.b_sliced_threads )
     {
         int i, j;
         for( j=0; j<=h->sh.b_mbaff; j++ )
@@ -1425,10 +1467,8 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
         }
     }
 
-    if( h->param.i_threads > 1 && h->fdec->b_kept_as_ref )
-    {
+    if( h->param.i_threads > 1 && h->fdec->b_kept_as_ref && !h->param.b_sliced_threads )
         x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
-    }
 
     min_y = X264_MAX( min_y*16-8, 0 );
     max_y = b_end ? h->param.i_height : mb_y*16-8;
@@ -1463,7 +1503,7 @@ static inline int x264_reference_update( x264_t *h )
     int i, j;
     if( !h->fdec->b_kept_as_ref )
     {
-        if( h->param.i_threads > 1 )
+        if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
         {
             x264_frame_push_unused( h, h->fdec );
             h->fdec = x264_frame_pop_unused( h, 1 );
@@ -1567,8 +1607,6 @@ static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
     {
         /* Nothing to do ? */
     }
-
-    x264_macroblock_slice_init( h );
 }
 
 static int x264_slice_write( x264_t *h )
@@ -1587,6 +1625,7 @@ static int x264_slice_write( x264_t *h )
     x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc );
 
     /* Slice header */
+    x264_macroblock_slice_init( h );
     x264_slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc );
     if( h->param.b_cabac )
     {
@@ -1626,7 +1665,7 @@ static int x264_slice_write( x264_t *h )
             }
         }
 
-        if( i_mb_x == 0 && !h->mb.b_reencode_mb )
+        if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
             x264_fdec_filter_row( h, i_mb_y );
 
         /* load cache */
@@ -1795,7 +1834,8 @@ static int x264_slice_write( x264_t *h )
                                   + (h->out.i_nal*NALU_OVERHEAD * 8)
                                   - h->stat.frame.i_tex_bits
                                   - h->stat.frame.i_mv_bits;
-        x264_fdec_filter_row( h, h->sps->i_mb_height );
+        if( !h->param.b_sliced_threads )
+            x264_fdec_filter_row( h, h->sps->i_mb_height );
     }
 
     return 0;
@@ -1803,11 +1843,11 @@ static int x264_slice_write( x264_t *h )
 
 static void x264_thread_sync_context( x264_t *dst, x264_t *src )
 {
-    x264_frame_t **f;
     if( dst == src )
         return;
 
     // reference counting
+    x264_frame_t **f;
     for( f = src->frames.reference; *f; f++ )
         (*f)->i_reference_count++;
     for( f = dst->frames.reference; *f; f++ )
@@ -1831,6 +1871,7 @@ static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
 static void *x264_slices_write( x264_t *h )
 {
     int i_slice_num = 0;
+    int last_thread_mb = h->sh.i_last_mb;
     if( h->param.i_sync_lookahead )
         x264_lower_thread_priority( 10 );
 
@@ -1849,20 +1890,19 @@ static void *x264_slices_write( x264_t *h )
     /* init stats */
     memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
     h->mb.b_reencode_mb = 0;
-    while( h->sh.i_first_mb < h->mb.i_mb_count )
+    while( h->sh.i_first_mb <= last_thread_mb )
     {
-        h->sh.i_last_mb = h->mb.i_mb_count - 1;
+        h->sh.i_last_mb = last_thread_mb;
         if( h->param.i_slice_max_mbs )
             h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
-        else if( h->param.i_slice_count )
+        else if( h->param.i_slice_count && !h->param.b_sliced_threads )
         {
-            x264_emms();
-            i_slice_num++;
-            double height = h->sps->i_mb_height >> h->param.b_interlaced;
+            int height = h->sps->i_mb_height >> h->param.b_interlaced;
             int width = h->sps->i_mb_width << h->param.b_interlaced;
-            h->sh.i_last_mb = (int)(height * i_slice_num / h->param.i_slice_count + 0.5) * width - 1;
+            i_slice_num++;
+            h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1;
         }
-        h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, h->mb.i_mb_count - 1 );
+        h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
         if( x264_stack_align( x264_slice_write, h ) )
             return (void *)-1;
         h->sh.i_first_mb = h->sh.i_last_mb + 1;
@@ -1879,6 +1919,65 @@ static void *x264_slices_write( x264_t *h )
     return (void *)0;
 }
 
+static int x264_threaded_slices_write( x264_t *h ) /* Encode the current frame as i_threads row-ranges in parallel, then merge NALs and stats into h; returns 0 on success, nonzero on failure. */
+{
+    int i, j;
+    void *ret = NULL;
+    /* set first/last mb and sync contexts: each thread gets a contiguous range of macroblock rows */
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        x264_t *t = h->thread[i];
+        if( i ) /* h->thread[0] needs no copy (presumably it is h itself -- confirm) */
+        {
+            t->param = h->param;
+            memcpy( &t->i_frame, &h->i_frame, offsetof(x264_t, rc) - offsetof(x264_t, i_frame) ); /* copy every field from i_frame up to (not including) rc */
+        }
+        int height = h->sps->i_mb_height >> h->param.b_interlaced;
+        t->i_threadslice_start = ((height *  i    + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced; /* NOTE(review): rounding term uses i_slice_count, divisor uses i_threads */
+        t->i_threadslice_end   = ((height * (i+1) + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced; /* row after the last row of this range */
+        t->sh.i_first_mb = t->i_threadslice_start * h->sps->i_mb_width;
+        t->sh.i_last_mb  =   t->i_threadslice_end * h->sps->i_mb_width - 1;
+    }
+
+    x264_analyse_weight_frame( h, h->sps->i_mb_height*16 + 16 ); /* weight the whole reference up front, before any worker starts */
+
+    x264_threads_distribute_ratecontrol( h );
+
+    /* dispatch: one worker per thread context, then wait for all of them */
+    for( i = 0; i < h->param.i_threads; i++ )
+        if( x264_pthread_create( &h->thread[i]->thread_handle, NULL, (void*)x264_slices_write, (void*)h->thread[i] ) )
+            return -1; /* NOTE(review): workers already started are not joined on this path */
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        x264_pthread_join( h->thread[i]->thread_handle, &ret );
+        if( (intptr_t)ret )
+            return (intptr_t)ret; /* NOTE(review): remaining workers are not joined on this path */
+    }
+
+    /* deblocking and hpel filtering, done here for every row after all slices are encoded */
+    for( i = 0; i <= h->sps->i_mb_height; i++ )
+        x264_fdec_filter_row( h, i );
+
+    for( i = 1; i < h->param.i_threads; i++ ) /* append the other threads' output to h's own */
+    {
+        x264_t *t = h->thread[i];
+        for( j = 0; j < t->out.i_nal; j++ )
+        {
+            h->out.nal[h->out.i_nal++] = t->out.nal[j];
+            if( x264_nal_check_buffer( h ) ) /* growing the NAL array failed: stop before the next store lands past its end */
+                return -1;
+        }
+        /* All entries in stat.frame are ints except for ssd/ssim,
+         * which are only calculated in the main thread. */
+        for( j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ )
+            ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
+    }
+
+    x264_threads_merge_ratecontrol( h );
+
+    return 0;
+}
+
 /****************************************************************************
  * x264_encoder_encode:
  *  XXX: i_poc   : is the poc of the current given picture
@@ -1898,12 +1997,9 @@ int     x264_encoder_encode( x264_t *h,
                              x264_picture_t *pic_out )
 {
     x264_t *thread_current, *thread_prev, *thread_oldest;
-    int     i_nal_type;
-    int     i_nal_ref_idc;
+    int i_nal_type, i_nal_ref_idc, i_global_qp, i;
 
-    int   i_global_qp;
-
-    if( h->param.i_threads > 1)
+    if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
     {
         thread_prev    = h->thread[ h->i_thread_phase ];
         h->i_thread_phase = (h->i_thread_phase + 1) % h->param.i_threads;
@@ -1964,7 +2060,7 @@ int     x264_encoder_encode( x264_t *h,
         /* 2: Place the frame into the queue for its slice type decision */
         x264_lookahead_put_frame( h, fenc );
 
-        if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
+        if( h->frames.i_input <= h->frames.i_delay + (h->param.b_sliced_threads ? 0 : 1 - h->param.i_threads) )
         {
             /* Nothing yet to encode, waiting for filling of buffers */
             pic_out->i_type = X264_TYPE_AUTO;
@@ -2061,8 +2157,19 @@ int     x264_encoder_encode( x264_t *h,
 
     /* ---------------------- Write the bitstream -------------------------- */
     /* Init bitstream context */
-    h->out.i_nal = 0;
-    bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+    if( h->param.b_sliced_threads )
+    {
+        for( i = 0; i < h->param.i_threads; i++ )
+        {
+            bs_init( &h->thread[i]->out.bs, h->thread[i]->out.p_bitstream, h->thread[i]->out.i_bitstream );
+            h->thread[i]->out.i_nal = 0;
+        }
+    }
+    else
+    {
+        bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+        h->out.i_nal = 0;
+    }
 
     if( h->param.b_aud )
     {
@@ -2145,12 +2252,19 @@ int     x264_encoder_encode( x264_t *h,
         h->i_frame_num++;
 
     /* Write frame */
-    if( h->param.i_threads > 1 )
+    h->i_threadslice_start = 0;
+    h->i_threadslice_end = h->sps->i_mb_height;
+    if( !h->param.b_sliced_threads && h->param.i_threads > 1 )
     {
         if( x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h ) )
             return -1;
         h->b_thread_active = 1;
     }
+    else if( h->param.b_sliced_threads )
+    {
+        if( x264_threaded_slices_write( h ) )
+            return -1;
+    }
     else
         if( (intptr_t)x264_slices_write( h ) )
             return -1;
@@ -2375,7 +2489,7 @@ void    x264_encoder_close  ( x264_t *h )
 
     x264_lookahead_delete( h );
 
-    for( i=0; i<h->param.i_threads; i++ )
+    for( i = 0; i < h->param.i_threads; i++ )
     {
         // don't strictly have to wait for the other threads, but it's simpler than canceling them
         if( h->thread[i]->b_thread_active )
@@ -2386,7 +2500,7 @@ void    x264_encoder_close  ( x264_t *h )
         }
     }
 
-    if( h->param.i_threads > 1 )
+    if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
     {
         x264_t *thread_prev;
 
@@ -2659,20 +2773,23 @@ void    x264_encoder_close  ( x264_t *h )
     {
         x264_frame_t **frame;
 
-        for( frame = h->thread[i]->frames.reference; *frame; frame++ )
+        if( !h->param.b_sliced_threads || i == 0 )
         {
+            for( frame = h->thread[i]->frames.reference; *frame; frame++ )
+            {
+                assert( (*frame)->i_reference_count > 0 );
+                (*frame)->i_reference_count--;
+                if( (*frame)->i_reference_count == 0 )
+                    x264_frame_delete( *frame );
+            }
+            frame = &h->thread[i]->fdec;
             assert( (*frame)->i_reference_count > 0 );
             (*frame)->i_reference_count--;
             if( (*frame)->i_reference_count == 0 )
                 x264_frame_delete( *frame );
+            x264_macroblock_cache_end( h->thread[i] );
         }
-        frame = &h->thread[i]->fdec;
-        assert( (*frame)->i_reference_count > 0 );
-        (*frame)->i_reference_count--;
-        if( (*frame)->i_reference_count == 0 )
-            x264_frame_delete( *frame );
-
-        x264_macroblock_cache_end( h->thread[i] );
+        x264_free( h->thread[i]->scratch_buffer );
         x264_free( h->thread[i]->out.p_bitstream );
         x264_free( h->thread[i]->out.nal);
         x264_free( h->thread[i] );
index f2bed16ab6a471e2853baa0d946463764f1c6ab1..f33b167fed58f386de80db58cb22a3dd380e3daa 100644 (file)
@@ -172,6 +172,7 @@ void x264_lookahead_delete( x264_t *h )
         x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
         x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
         x264_macroblock_cache_end( h->thread[h->param.i_threads] );
+        x264_free( h->thread[h->param.i_threads]->scratch_buffer );
         x264_free( h->thread[h->param.i_threads] );
     }
     x264_synch_frame_list_delete( &h->lookahead->ifbuf );
index 9a027db2633a87ade6ab73f62de42782abc92cdf..ef23f2323d22a3d71a4e80a510f01d673b085658 100644 (file)
@@ -128,11 +128,12 @@ struct x264_ratecontrol_t
     double lmin[5];             /* min qscale by frame type */
     double lmax[5];
     double lstep;               /* max change (multiply) in qscale per frame */
-    uint16_t *qp_buffer; /* Global buffer for converting MB-tree quantizer data. */
+    uint16_t *qp_buffer;        /* Global buffer for converting MB-tree quantizer data. */
 
     /* MBRC stuff */
     double frame_size_estimated;
     double frame_size_planned;
+    double slice_size_planned;
     predictor_t (*row_pred)[2];
     predictor_t row_preds[5][2];
     predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
@@ -1121,7 +1122,7 @@ static double row_bits_so_far( x264_t *h, int y )
 {
     int i;
     double bits = 0;
-    for( i = 0; i <= y; i++ )
+    for( i = h->i_threadslice_start; i <= y; i++ )
         bits += h->fdec->i_row_bits[i];
     return bits;
 }
@@ -1130,7 +1131,7 @@ static double predict_row_size_sum( x264_t *h, int y, int qp )
 {
     int i;
     double bits = row_bits_so_far(h, y);
-    for( i = y+1; i < h->sps->i_mb_height; i++ )
+    for( i = y+1; i < h->i_threadslice_end; i++ )
         bits += predict_row_size( h, i, qp );
     return bits;
 }
@@ -1161,7 +1162,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
     }
 
     /* tweak quality based on difference from predicted size */
-    if( y < h->sps->i_mb_height-1 )
+    if( y < h->i_threadslice_end-1 )
     {
         int prev_row_qp = h->fdec->i_row_qp[y];
         int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
@@ -1174,16 +1175,20 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
             rc->qpm = X264_MAX( rc->qpm, i_qp_min );
         }
 
-        int b0 = predict_row_size_sum( h, y, rc->qpm );
-        int b1 = b0;
         float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
-
+        float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
+        float size_of_other_slices = rc->frame_size_planned - slice_size_planned;
         /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
         float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
+        int b1 = predict_row_size_sum( h, y, rc->qpm );
+
+        /* Assume that if this slice has become larger than expected,
+         * the other slices will have gotten equally larger. */
+        b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
 
         /* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
         /* area at the top of the frame was measured inaccurately. */
-        if( row_bits_so_far(h,y) < 0.05 * rc->frame_size_planned )
+        if( row_bits_so_far(h,y) < 0.05 * (rc->frame_size_planned-size_of_other_slices) )
             return;
 
         if( h->sh.i_type != SLICE_TYPE_I )
@@ -1199,6 +1204,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
         {
             rc->qpm ++;
             b1 = predict_row_size_sum( h, y, rc->qpm );
+            b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
         }
 
         while( rc->qpm > i_qp_min
@@ -1208,6 +1214,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
         {
             rc->qpm --;
             b1 = predict_row_size_sum( h, y, rc->qpm );
+            b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
         }
 
         /* avoid VBV underflow */
@@ -1216,6 +1223,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
         {
             rc->qpm ++;
             b1 = predict_row_size_sum( h, y, rc->qpm );
+            b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
         }
 
         x264_ratecontrol_set_estimated_size(h, b1);
@@ -1568,7 +1576,7 @@ static void update_vbv_plan( x264_t *h, int overhead )
 {
     x264_ratecontrol_t *rcc = h->rc;
     rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final - overhead;
-    if( h->param.i_threads > 1 )
+    if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
     {
         int j = h->rc - h->thread[0]->rc;
         int i;
@@ -1612,7 +1620,7 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
             {
                 double frame_q[3];
                 double cur_bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-                double buffer_fill_cur = rcc->buffer_fill - cur_bits + rcc->buffer_rate;
+                double buffer_fill_cur = rcc->buffer_fill - cur_bits;
                 double target_fill;
                 frame_q[0] = h->sh.i_type == SLICE_TYPE_I ? q * h->param.rc.f_ip_factor : q;
                 frame_q[1] = frame_q[0] * h->param.rc.f_pb_factor;
@@ -1621,13 +1629,14 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
                 /* Loop over the planned future frames. */
                 for( j = 0; buffer_fill_cur >= 0 && buffer_fill_cur <= rcc->buffer_size; j++ )
                 {
+                    buffer_fill_cur += rcc->buffer_rate;
                     int i_type = h->fenc->i_planned_type[j];
                     int i_satd = h->fenc->i_planned_satd[j];
                     if( i_type == X264_TYPE_AUTO )
                         break;
                     i_type = IS_X264_TYPE_I( i_type ) ? SLICE_TYPE_I : IS_X264_TYPE_B( i_type ) ? SLICE_TYPE_B : SLICE_TYPE_P;
                     cur_bits = predict_size( &rcc->pred[i_type], frame_q[i_type], i_satd );
-                    buffer_fill_cur = buffer_fill_cur - cur_bits + rcc->buffer_rate;
+                    buffer_fill_cur -= cur_bits;
                 }
                 /* Try to get to get the buffer at least 50% filled, but don't set an impossible goal. */
                 target_fill = X264_MIN( rcc->buffer_fill + j * rcc->buffer_rate * 0.5, rcc->buffer_size * 0.5 );
@@ -1793,7 +1802,7 @@ static float rate_estimate_qscale( x264_t *h )
 
             if( rcc->b_vbv )
             {
-                if( h->param.i_threads > 1 )
+                if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
                 {
                     int j = h->rc - h->thread[0]->rc;
                     int i;
@@ -1945,6 +1954,58 @@ static float rate_estimate_qscale( x264_t *h )
     }
 }
 
+/* Copy the frame-level ratecontrol state into every slice thread and give each
+ * thread a share of frame_size_planned proportional to the SATD of its rows. */
+void x264_threads_distribute_ratecontrol( x264_t *h )
+{
+    int i, row, totalsize = 0;
+    x264_ratecontrol_t *rc = h->rc;
+    if( rc->b_vbv )
+        for( row = 0; row < h->sps->i_mb_height; row++ )
+            totalsize += h->fdec->i_row_satd[row];
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        x264_t *t = h->thread[i];
+        int size = 0;
+        if( t->rc != rc ) /* memcpy with identical source and destination is undefined */
+            memcpy( t->rc, rc, sizeof( x264_ratecontrol_t ) );
+        /* Sum this thread's own rows (not h's), so each slice gets its own plan. */
+        if( rc->b_vbv && rc->frame_size_planned )
+            for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
+                size += h->fdec->i_row_satd[row];
+        /* size is 0 without VBV/planned size or when the rows sum to 0: plan 0, never 0/0. */
+        t->rc->slice_size_planned = size ? size * rc->frame_size_planned / totalsize : 0;
+    }
+}
+
+/* Fold the slice threads' ratecontrol results back into h->rc: qpa_rc/qpa_aq are
+ * summed, and each row predictor field becomes the mean over all threads. */
+void x264_threads_merge_ratecontrol( x264_t *h )
+{
+    x264_ratecontrol_t *dst = h->rc;
+    int n = h->param.i_threads, thr, r, c;
+    x264_emms();
+    for( thr = 1; thr < n; thr++ )
+    {
+        dst->qpa_rc += h->thread[thr]->rc->qpa_rc;
+        dst->qpa_aq += h->thread[thr]->rc->qpa_aq;
+    }
+    for( r = 0; r < 5; r++ )
+        for( c = 0; c < 2; c++ )
+        {
+            /* h->rc's own values count as one of the n contributions (thr starts at 1). */
+            for( thr = 1; thr < n; thr++ )
+            {
+                dst->row_preds[r][c].coeff += h->thread[thr]->rc->row_preds[r][c].coeff;
+                dst->row_preds[r][c].offset += h->thread[thr]->rc->row_preds[r][c].offset;
+                dst->row_preds[r][c].count += h->thread[thr]->rc->row_preds[r][c].count;
+            }
+            dst->row_preds[r][c].coeff /= n;
+            dst->row_preds[r][c].offset /= n;
+            dst->row_preds[r][c].count /= n;
+        }
+}
+
 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
 {
     if( cur != prev )
index b9d552b5c78af86528a582ade4c64f447dea5e8d..5a8d088522f284c7fd69c35da355ca10020b0520 100644 (file)
@@ -43,6 +43,8 @@ void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
 int  x264_ratecontrol_get_estimated_size( x264_t const *);
 int  x264_rc_analyse_slice( x264_t *h );
 int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
+void x264_threads_distribute_ratecontrol( x264_t *h );
+void x264_threads_merge_ratecontrol( x264_t *h );
 
 #endif
 
diff --git a/x264.c b/x264.c
index a2901006016b4be0ea7f26ff3104124043995e77..190dba0bcd506b0f07ca87da02f052e4c6af63ab 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -157,20 +157,20 @@ static void Help( x264_param_t *defaults, int longhelp )
     H0( "Example usage:\n" );
     H0( "\n" );
     H0( "      Constant quality mode:\n" );
-    H0( "            x264 --crf 24 -o output input\n" );
+    H0( "            x264 --crf 24 -o <output> <input>\n" );
     H0( "\n" );
     H0( "      Two-pass with a bitrate of 1000kbps:\n" );
-    H0( "            x264 --pass 1 --bitrate 1000 -o output input\n" );
-    H0( "            x264 --pass 2 --bitrate 1000 -o output input\n" );
+    H0( "            x264 --pass 1 --bitrate 1000 -o <output> <input>\n" );
+    H0( "            x264 --pass 2 --bitrate 1000 -o <output> <input>\n" );
     H0( "\n" );
     H0( "      Lossless:\n" );
-    H0( "            x264 --crf 0 -o output input\n" );
+    H0( "            x264 --crf 0 -o <output> <input>\n" );
     H0( "\n" );
     H0( "      Maximum PSNR at the cost of speed and visual quality:\n" );
-    H0( "            x264 --preset placebo --tune psnr -o output input\n" );
+    H0( "            x264 --preset placebo --tune psnr -o <output> <input>\n" );
     H0( "\n" );
     H0( "      Constant bitrate at 1000kbps with a 2 second-buffer:\n");
-    H0( "            x264 --vbv-bufsize 2000 --bitrate 1000 -o output input\n" );
+    H0( "            x264 --vbv-bufsize 2000 --bitrate 1000 -o <output> <input>\n" );
     H0( "\n" );
     H0( "Presets:\n" );
     H0( "\n" );
@@ -245,12 +245,16 @@ static void Help( x264_param_t *defaults, int longhelp )
         "                                  - fastdecode:\n"
         "                                    --no-cabac --no-deblock --no-weightb\n"
         "                                    --weightp 0\n"
+        "                                  - zerolatency:\n"
+        "                                    --bframes 0 --rc-lookahead 0\n"
+        "                                    --sync-lookahead 0 --sliced-threads\n"
         "                                  - touhou:\n"
         "                                    --aq-strength 1.3 --deblock -1:-1\n"
         "                                    --partitions {p4x4 if p8x8 set}\n"
         "                                    --psy-rd <unset>:0.2\n"
         "                                    --ref {Double if >1 else 1}\n" );
-    else H0( "                                  - film,animation,grain,psnr,ssim,fastdecode\n" );
+    else H0( "                                  - film,animation,grain,psnr,ssim\n"
+             "                                  - fastdecode,zerolatency\n" );
     H1( "      --slow-firstpass        Don't use faster settings with --pass 1\n" );
     H0( "\n" );
     H0( "Frame-type options:\n" );
@@ -444,6 +448,7 @@ static void Help( x264_param_t *defaults, int longhelp )
     H1( "      --psnr                  Enable PSNR computation\n" );
     H1( "      --ssim                  Enable SSIM computation\n" );
     H1( "      --threads <integer>     Force a specific number of threads\n" );
+    H2( "      --sliced-threads        Low-latency but lower-efficiency threading\n" );
     H2( "      --thread-input          Run Avisynth in its own thread\n" );
     H2( "      --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
     H2( "      --non-deterministic     Slightly improve quality of SMP, at the cost of repeatability\n" );
@@ -563,6 +568,8 @@ static struct option long_options[] =
     { "zones",       required_argument, NULL, 0 },
     { "qpfile",      required_argument, NULL, OPT_QPFILE },
     { "threads",     required_argument, NULL, 0 },
+    { "sliced-threads",    no_argument, NULL, 0 },
+    { "no-sliced-threads", no_argument, NULL, 0 },
     { "slice-max-size",    required_argument, NULL, 0 },
     { "slice-max-mbs",     required_argument, NULL, 0 },
     { "slices",            required_argument, NULL, 0 },
@@ -878,6 +885,13 @@ static int  Parse( int argc, char **argv,
                 param->analyse.b_weighted_bipred = 0;
                 param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
             }
+            else if( !strcasecmp( optarg, "zerolatency" ) )
+            {
+                param->rc.i_lookahead = 0;
+                param->i_sync_lookahead = 0;
+                param->i_bframe = 0;
+                param->b_sliced_threads = 1;
+            }
             else if( !strcasecmp( optarg, "touhou" ) )
             {
                 param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
diff --git a/x264.h b/x264.h
index b8e31136cdf2f4bb505459df77098843b9077d58..f1d820235d3442d5bff9d1cc198f59f7bbc2682a 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 79
+#define X264_BUILD 80
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -165,6 +165,7 @@ typedef struct x264_param_t
     /* CPU flags */
     unsigned int cpu;
     int         i_threads;       /* encode multiple frames in parallel */
+    int         b_sliced_threads;  /* Whether to use slice-based threading. */
     int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
     int         i_sync_lookahead; /* threaded lookahead buffer */