git.sesse.net Git - x264/commitdiff
Threaded lookahead
author Fiona Glaser <fiona@x264.com>
Tue, 8 May 2012 22:42:56 +0000 (15:42 -0700)
committer Fiona Glaser <fiona@x264.com>
Fri, 18 May 2012 23:15:14 +0000 (16:15 -0700)
Split each lookahead frame analysis call into multiple threads.  Has a small
impact on quality, but does not seem to be consistently any worse.

This helps alleviate bottlenecks with many cores and frame threads. In many
cases, this massively increases performance on many-core systems.  For example,
over 100% faster 1080p encoding with --preset veryfast on a 12-core i7 system.
Realtime 1080p30 at --preset slow should now be feasible on real systems.

For sliced-threads, this patch should be faster regardless of settings (~10%).

By default, lookahead threads are 1/6 of regular threads.  This isn't exacting,
but it seems to work well for all presets on real systems.  With sliced-threads,
it's the same as the number of encoding threads.

common/common.c
common/common.h
common/macroblock.c
common/threadpool.c
encoder/encoder.c
encoder/slicetype.c
x264.c
x264.h

index d03201d8f1ea2fcf3f8aa48b0757ecf382decd1a..3f40e66f11c205259ed9cc9226960bcb404043fb 100644 (file)
@@ -50,6 +50,7 @@ void x264_param_default( x264_param_t *param )
     /* CPU autodetect */
     param->cpu = x264_cpu_detect();
     param->i_threads = X264_THREADS_AUTO;
+    param->i_lookahead_threads = X264_THREADS_AUTO;
     param->b_deterministic = 1;
     param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
 
@@ -632,6 +633,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         else
             p->i_threads = atoi(value);
     }
+    OPT("lookahead-threads")
+    {
+        if( !strcmp(value, "auto") )
+            p->i_lookahead_threads = X264_THREADS_AUTO;
+        else
+            p->i_lookahead_threads = atoi(value);
+    }
     OPT("sliced-threads")
         p->b_sliced_threads = atobool(value);
     OPT("sync-lookahead")
@@ -1285,6 +1293,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip );
     s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
     s += sprintf( s, " threads=%d", p->i_threads );
+    s += sprintf( s, " lookahead_threads=%d", p->i_lookahead_threads );
     s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
     if( p->i_slice_count )
         s += sprintf( s, " slices=%d", p->i_slice_count );
index 5e3421291613becbd7e53fc5aa4fc830db5f3419..04ac11dae5274427c01153ceeb8933815ae13fdd 100644 (file)
@@ -56,6 +56,7 @@ do {\
 #define X264_BFRAME_MAX 16
 #define X264_REF_MAX 16
 #define X264_THREAD_MAX 128
+#define X264_LOOKAHEAD_THREAD_MAX 16
 #define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
 #define X264_LOOKAHEAD_MAX 250
 #define QP_BD_OFFSET (6*(BIT_DEPTH-8))
@@ -469,6 +470,7 @@ struct x264_t
     x264_param_t    param;
 
     x264_t          *thread[X264_THREAD_MAX+1];
+    x264_t          *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX];
     int             b_thread_active;
     int             i_thread_phase; /* which thread to use for the next frame */
     int             i_thread_idx;   /* which thread this is */
@@ -476,6 +478,7 @@ struct x264_t
     int             i_threadslice_end; /* row after the end of this thread slice */
     int             i_threadslice_pass; /* which pass of encoding we are on */
     x264_threadpool_t *threadpool;
+    x264_threadpool_t *lookaheadpool;
     x264_pthread_mutex_t mutex;
     x264_pthread_cond_t cv;
 
@@ -915,6 +918,7 @@ struct x264_t
 
     /* Buffers that are allocated per-thread even in sliced threads. */
     void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+    void *scratch_buffer2; /* if the first one's already in use */
     pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
     /* Deblock strength values are stored for each 4x4 partition. In MBAFF
      * there are four extra values that need to be stored, located in [4][i]. */
index 8216799c305d78ae1452938275fbdd25303473d7..abce8f68862c90856b2c0c9fab866cac3fc76cda 100644 (file)
@@ -401,6 +401,9 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
     else
         h->scratch_buffer = NULL;
 
+    int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
+    CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads );
+
     return 0;
 fail:
     return -1;
@@ -418,6 +421,7 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
                 x264_free( h->intra_border_backup[i][j] - 16 );
     }
     x264_free( h->scratch_buffer );
+    x264_free( h->scratch_buffer2 );
 }
 
 void x264_macroblock_slice_init( x264_t *h )
index f7a95fcce34ebd57d6342499c6f6a84827b1e4b7..a11bf9d259659f02e5589b39ab705b5d9883f327 100644 (file)
@@ -66,7 +66,7 @@ static void x264_threadpool_thread( x264_threadpool_t *pool )
         x264_pthread_mutex_unlock( &pool->run.mutex );
         if( !job )
             continue;
-        job->ret = job->func( job->arg ); /* execute the function */
+        job->ret = (void*)x264_stack_align( job->func, job->arg ); /* execute the function */
         x264_sync_frame_list_push( &pool->done, (void*)job );
     }
 }
@@ -83,7 +83,7 @@ int x264_threadpool_init( x264_threadpool_t **p_pool, int threads,
 
     pool->init_func = init_func;
     pool->init_arg  = init_arg;
-    pool->threads   = X264_MIN( threads, X264_THREAD_MAX );
+    pool->threads   = threads;
 
     CHECKED_MALLOC( pool->thread_handle, pool->threads * sizeof(x264_pthread_t) );
 
index 2ed1e75fb222cceddf4a645525dd2ed45dbb5c0a..f6246f91c5f0efdf286d6b1594d3337bb89072e6 100644 (file)
@@ -395,6 +395,15 @@ static void x264_encoder_thread_init( x264_t *h )
         x264_cpu_mask_misalign_sse();
 #endif
 }
+
+static void x264_lookahead_thread_init( x264_t *h ) /* init callback handed to x264_threadpool_init() for h->lookaheadpool */
+{
+#if HAVE_MMX
+    /* Misalign mask has to be set separately for each thread; only done when param.cpu has X264_CPU_SSE_MISALIGN set. */
+    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+        x264_cpu_mask_misalign_sse();
+#endif
+}
 #endif
 
 /****************************************************************************
@@ -494,6 +503,9 @@ static int x264_validate_parameters( x264_t *h, int b_open )
 
     if( h->param.i_threads == X264_THREADS_AUTO )
         h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
+    if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
+        h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6);
+    int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 );
     if( h->param.i_threads > 1 )
     {
 #if !HAVE_THREAD
@@ -503,14 +515,15 @@ static int x264_validate_parameters( x264_t *h, int b_open )
         /* Avoid absurdly small thread slices as they can reduce performance
          * and VBV compliance.  Capped at an arbitrary 4 rows per thread. */
         if( h->param.b_sliced_threads )
-        {
-            int max_threads = (h->param.i_height+15)/16 / 4;
-            h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
-        }
+            h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads );
     }
     h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
+    h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) );
     if( h->param.i_threads == 1 )
+    {
         h->param.b_sliced_threads = 0;
+        h->param.i_lookahead_threads = 1;
+    }
     h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
     if( h->i_thread_frames > 1 )
         h->param.nalu_process = NULL;
@@ -1271,10 +1284,19 @@ x264_t *x264_encoder_open( x264_param_t *param )
     if( h->param.i_threads > 1 &&
         x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
         goto fail;
+    if( h->param.i_lookahead_threads > 1 &&
+        x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, (void*)x264_lookahead_thread_init, h ) )
+        goto fail;
 
     h->thread[0] = h;
     for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
         CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
+    if( h->param.i_lookahead_threads > 1 )
+        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+        {
+            CHECKED_MALLOC( h->lookahead_thread[i], sizeof(x264_t) );
+            *h->lookahead_thread[i] = *h;
+        }
 
     for( int i = 0; i < h->param.i_threads; i++ )
     {
@@ -3457,6 +3479,8 @@ void    x264_encoder_close  ( x264_t *h )
         x264_threadpool_wait_all( h );
     if( h->param.i_threads > 1 )
         x264_threadpool_delete( h->threadpool );
+    if( h->param.i_lookahead_threads > 1 )
+        x264_threadpool_delete( h->lookaheadpool );
     if( h->i_thread_frames > 1 )
     {
         for( int i = 0; i < h->i_thread_frames; i++ )
@@ -3766,6 +3790,10 @@ void    x264_encoder_close  ( x264_t *h )
                 if( h->thread[i]->fref[0][j] && h->thread[i]->fref[0][j]->b_duplicate )
                     x264_frame_delete( h->thread[i]->fref[0][j] );
 
+    if( h->param.i_lookahead_threads > 1 )
+        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+            x264_free( h->lookahead_thread[i] );
+
     for( int i = h->param.i_threads - 1; i >= 0; i-- )
     {
         x264_frame_t **frame;
index f1c207f3949cbeee179e98662911a5549619fa04..4968f4f5bc2a41ea72010a50460a340f990e1781 100644 (file)
@@ -424,9 +424,21 @@ static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *r
     }
 }
 
+/* Output buffers are separated by 128 bytes to avoid false sharing of cachelines
+ * in multithreaded lookahead. PAD_SIZE is counted in ints (128 bytes assuming 4-byte int). */
+#define PAD_SIZE 32
+/* Header ints at the start of each output buffer: cost_est, cost_est_aq, intra_mbs, num rows */
+#define NUM_INTS 4
+#define COST_EST 0 /* header slot: sum of per-MB costs without AQ weighting */
+#define COST_EST_AQ 1 /* header slot: sum of AQ-weighted per-MB costs */
+#define INTRA_MBS 2 /* header slot: sum of the per-MB b_intra flags */
+#define NUM_ROWS 3 /* header slot: number of MB rows stored after the header */
+#define ROW_SATD (NUM_INTS + (h->mb.i_mb_y - h->i_threadslice_start)) /* slot of the current MB row, relative to the slice start; needs `h` in scope */
+
 static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
                                     x264_frame_t **frames, int p0, int p1, int b,
-                                    int dist_scale_factor, int do_search[2], const x264_weight_t *w )
+                                    int dist_scale_factor, int do_search[2], const x264_weight_t *w,
+                                    int *output_inter, int *output_intra )
 {
     x264_frame_t *fref0 = frames[p0];
     x264_frame_t *fref1 = frames[p1];
@@ -571,7 +583,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
 #define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
             if( i_mb_x < h->mb.i_mb_width - 1 )
                 MVC( fenc_mv[1] );
-            if( i_mb_y < h->mb.i_mb_height - 1 )
+            if( i_mb_y < h->i_threadslice_end - 1 )
             {
                 MVC( fenc_mv[i_mb_stride] );
                 if( i_mb_x > 0 )
@@ -653,11 +665,11 @@ lowres_intra_mb:
         int i_icost_aq = i_icost;
         if( h->param.rc.i_aq_mode )
             i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
-        fenc->i_row_satds[0][0][h->mb.i_mb_y] += i_icost_aq;
+        output_intra[ROW_SATD] += i_icost_aq;
         if( b_frame_score_mb )
         {
-            fenc->i_cost_est[0][0] += i_icost;
-            fenc->i_cost_est_aq[0][0] += i_icost_aq;
+            output_intra[COST_EST] += i_icost;
+            output_intra[COST_EST_AQ] += i_icost_aq;
         }
     }
     i_bcost += lowres_penalty;
@@ -674,7 +686,7 @@ lowres_intra_mb:
             list_used = 0;
         }
         if( b_frame_score_mb )
-            fenc->i_intra_mbs[b-p0] += b_intra;
+            output_inter[INTRA_MBS] += b_intra;
     }
 
     /* In an I-frame, we've already added the results above in the intra section. */
@@ -683,12 +695,12 @@ lowres_intra_mb:
         int i_bcost_aq = i_bcost;
         if( h->param.rc.i_aq_mode )
             i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
-        fenc->i_row_satds[b-p0][p1-b][h->mb.i_mb_y] += i_bcost_aq;
+        output_inter[ROW_SATD] += i_bcost_aq;
         if( b_frame_score_mb )
         {
             /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
-            fenc->i_cost_est[b-p0][p1-b] += i_bcost;
-            fenc->i_cost_est_aq[b-p0][p1-b] += i_bcost_aq;
+            output_inter[COST_EST] += i_bcost;
+            output_inter[COST_EST_AQ] += i_bcost_aq;
         }
     }
 
@@ -701,6 +713,43 @@ lowres_intra_mb:
    (h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\
     h->mb.i_mb_width * h->mb.i_mb_height)
 
+typedef struct /* argument bundle for one x264_slicetype_slice_cost() call */
+{
+    x264_t *h; /* context to run on; its i_threadslice_start/i_threadslice_end bound the rows analysed */
+    x264_mb_analysis_t *a; /* forwarded as-is to x264_slicetype_mb_cost() */
+    x264_frame_t **frames; /* frame array indexed by p0, p1 and b */
+    int p0; /* index into frames of the list-0 reference */
+    int p1; /* index into frames of the list-1 reference */
+    int b; /* index into frames of the frame being costed */
+    int dist_scale_factor; /* forwarded as-is to x264_slicetype_mb_cost() */
+    int *do_search; /* two-entry array, forwarded as-is to x264_slicetype_mb_cost() */
+    const x264_weight_t *w; /* forwarded as-is to x264_slicetype_mb_cost() */
+    int *output_inter; /* inter accumulators: NUM_INTS header ints followed by one SATD int per row */
+    int *output_intra; /* intra accumulators, same layout as output_inter */
+} x264_slicetype_slice_t;
+
+static void x264_slicetype_slice_cost( x264_slicetype_slice_t *s ) /* calls x264_slicetype_mb_cost() for each MB of the slice described by s */
+{
+    x264_t *h = s->h;
+
+    /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
+     * This considerably improves MV prediction overall. */
+
+    /* The edge mbs seem to reduce the predictive quality of the
+     * whole frame's score, but are needed for a spatial distribution. */
+    int do_edges = h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2;
+
+    int start_y = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 + do_edges ); /* last row of this slice; bottom frame row skipped unless do_edges */
+    int end_y = X264_MAX( h->i_threadslice_start, 1 - do_edges ); /* first row of this slice; top frame row skipped unless do_edges */
+    int start_x = h->mb.i_mb_width - 2 + do_edges; /* rightmost column; frame edge column skipped unless do_edges */
+    int end_x = 1 - do_edges; /* leftmost column; frame edge column skipped unless do_edges */
+
+    for( h->mb.i_mb_y = start_y; h->mb.i_mb_y >= end_y; h->mb.i_mb_y-- ) /* rows bottom-to-top */
+        for( h->mb.i_mb_x = start_x; h->mb.i_mb_x >= end_x; h->mb.i_mb_x-- ) /* columns right-to-left */
+            x264_slicetype_mb_cost( h, s->a, s->frames, s->p0, s->p1, s->b, s->dist_scale_factor,
+                                    s->do_search, s->w, s->output_inter, s->output_intra );
+}
+
 static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                                       x264_frame_t **frames, int p0, int p1, int b,
                                       int b_intra_penalty )
@@ -708,77 +757,131 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
     int i_score = 0;
     int do_search[2];
     const x264_weight_t *w = x264_weight_none;
+    x264_frame_t *fenc = frames[b];
+
     /* Check whether we already evaluated this frame
      * If we have tried this frame as P, then we have also tried
      * the preceding frames as B. (is this still true?) */
     /* Also check that we already calculated the row SATDs for the current frame. */
-    if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || frames[b]->i_row_satds[b-p0][p1-b][0] != -1) )
-        i_score = frames[b]->i_cost_est[b-p0][p1-b];
+    if( fenc->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || fenc->i_row_satds[b-p0][p1-b][0] != -1) )
+        i_score = fenc->i_cost_est[b-p0][p1-b];
     else
     {
         int dist_scale_factor = 128;
-        int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
-        int *row_satd_intra = frames[b]->i_row_satds[0][0];
 
         /* For each list, check to see whether we have lowres motion-searched this reference frame before. */
-        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
-        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
+        do_search[0] = b != p0 && fenc->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
+        do_search[1] = b != p1 && fenc->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
         if( do_search[0] )
         {
             if( h->param.analyse.i_weighted_pred && b == p1 )
             {
                 x264_emms();
-                x264_weights_analyse( h, frames[b], frames[p0], 1 );
-                w = frames[b]->weight[0];
+                x264_weights_analyse( h, fenc, frames[p0], 1 );
+                w = fenc->weight[0];
             }
-            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+            fenc->lowres_mvs[0][b-p0-1][0][0] = 0;
         }
-        if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
+        if( do_search[1] ) fenc->lowres_mvs[1][p1-b-1][0][0] = 0;
 
-        if( b == p1 )
-            frames[b]->i_intra_mbs[b-p0] = 0;
-        if( !frames[b]->b_intra_calculated )
-        {
-            frames[b]->i_cost_est[0][0] = 0;
-            frames[b]->i_cost_est_aq[0][0] = 0;
-        }
         if( p1 != p0 )
             dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
 
-        frames[b]->i_cost_est[b-p0][p1-b] = 0;
-        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
-
-        /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
-         * This considerably improves MV prediction overall. */
+        int output_buf_size = h->mb.i_mb_height + (NUM_INTS + PAD_SIZE) * h->param.i_lookahead_threads;
+        int *output_inter[X264_LOOKAHEAD_THREAD_MAX+1];
+        int *output_intra[X264_LOOKAHEAD_THREAD_MAX+1];
+        output_inter[0] = h->scratch_buffer2;
+        output_intra[0] = output_inter[0] + output_buf_size;
 
-        /* The edge mbs seem to reduce the predictive quality of the
-         * whole frame's score, but are needed for a spatial distribution. */
-        if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
-            h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
+        if( h->param.i_lookahead_threads > 1 )
         {
-            for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
+            x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
+
+            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
             {
-                row_satd[h->mb.i_mb_y] = 0;
-                if( !frames[b]->b_intra_calculated )
-                    row_satd_intra[h->mb.i_mb_y] = 0;
-                for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
-                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+                x264_t *t = h->lookahead_thread[i];
+
+                /* FIXME move this somewhere else */
+                t->mb.i_me_method = h->mb.i_me_method;
+                t->mb.i_subpel_refine = h->mb.i_subpel_refine;
+                t->mb.b_chroma_me = h->mb.b_chroma_me;
+
+                s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+                                                 output_inter[i], output_intra[i] };
+
+                t->i_threadslice_start = ((h->mb.i_mb_height *  i    + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+                t->i_threadslice_end   = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+
+                int thread_height = t->i_threadslice_end - t->i_threadslice_start;
+                int thread_output_size = thread_height + NUM_INTS;
+                memset( output_inter[i], 0, thread_output_size * sizeof(int) );
+                memset( output_intra[i], 0, thread_output_size * sizeof(int) );
+                output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
+
+                output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
+                output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
+
+                x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
             }
+            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+                x264_threadpool_wait( h->lookaheadpool, &s[i] );
         }
         else
         {
-            for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
-                for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
-                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+            h->i_threadslice_start = 0;
+            h->i_threadslice_end = h->mb.i_mb_height;
+            memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+            memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+            output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
+            x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+                                                                 output_inter[0], output_intra[0] };
+            x264_slicetype_slice_cost( &s );
+        }
+
+        /* Sum up accumulators */
+        if( b == p1 )
+            fenc->i_intra_mbs[b-p0] = 0;
+        if( !fenc->b_intra_calculated )
+        {
+            fenc->i_cost_est[0][0] = 0;
+            fenc->i_cost_est_aq[0][0] = 0;
+        }
+        fenc->i_cost_est[b-p0][p1-b] = 0;
+        fenc->i_cost_est_aq[b-p0][p1-b] = 0;
+
+        int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
+        int *row_satd_intra = fenc->i_row_satds[0][0];
+        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+        {
+            if( b == p1 )
+                fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
+            if( !fenc->b_intra_calculated )
+            {
+                fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
+                fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
+            }
+
+            fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
+            fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
+
+            if( h->param.rc.i_vbv_buffer_size )
+            {
+                int row_count = output_inter[i][NUM_ROWS];
+                memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
+                if( !fenc->b_intra_calculated )
+                    memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
+                row_satd_inter += row_count;
+                row_satd_intra += row_count;
+            }
         }
 
-        i_score = frames[b]->i_cost_est[b-p0][p1-b];
+        i_score = fenc->i_cost_est[b-p0][p1-b];
         if( b != p1 )
             i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
         else
-            frames[b]->b_intra_calculated = 1;
+            fenc->b_intra_calculated = 1;
 
-        frames[b]->i_cost_est[b-p0][p1-b] = i_score;
+        fenc->i_cost_est[b-p0][p1-b] = i_score;
         x264_emms();
     }
 
@@ -786,7 +889,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
     {
         // arbitrary penalty for I-blocks after B-frames
         int nmb = NUM_MBS;
-        i_score += (uint64_t)i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8);
+        i_score += (uint64_t)i_score * fenc->i_intra_mbs[b-p0] / (nmb * 8);
     }
     return i_score;
 }
diff --git a/x264.c b/x264.c
index 886ba370784cebb51c19403ca62cba7101fa5c56..b2b796673c05baaf9e43bec0f78ea90f565f9eee 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -797,6 +797,7 @@ static void help( x264_param_t *defaults, int longhelp )
     H1( "      --psnr                  Enable PSNR computation\n" );
     H1( "      --ssim                  Enable SSIM computation\n" );
     H1( "      --threads <integer>     Force a specific number of threads\n" );
+    H2( "      --lookahead-threads <integer> Force a specific number of lookahead threads\n" );
     H2( "      --sliced-threads        Low-latency but lower-efficiency threading\n" );
     H2( "      --thread-input          Run Avisynth in its own thread\n" );
     H2( "      --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
@@ -965,6 +966,7 @@ static struct option long_options[] =
     { "zones",       required_argument, NULL, 0 },
     { "qpfile",      required_argument, NULL, OPT_QPFILE },
     { "threads",     required_argument, NULL, 0 },
+    { "lookahead-threads", required_argument, NULL, 0 },
     { "sliced-threads",    no_argument, NULL, 0 },
     { "no-sliced-threads", no_argument, NULL, 0 },
     { "slice-max-size",    required_argument, NULL, 0 },
diff --git a/x264.h b/x264.h
index 3dcb386d898f5fbfd0f97b3f727cd6f7f9ce7986..f150151efa7e593015bb169045f2d8903828667e 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
 
 #include "x264_config.h"
 
-#define X264_BUILD 124
+#define X264_BUILD 125
 
 /* Application developers planning to link against a shared library version of
  * libx264 from a Microsoft Visual Studio or similar development environment
@@ -254,7 +254,8 @@ typedef struct x264_param_t
 {
     /* CPU flags */
     unsigned int cpu;
-    int         i_threads;       /* encode multiple frames in parallel */
+    int         i_threads;           /* encode multiple frames in parallel */
+    int         i_lookahead_threads; /* multiple threads for lookahead analysis */
     int         b_sliced_threads;  /* Whether to use slice-based threading. */
     int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
     int         b_cpu_independent; /* force canonical behavior rather than cpu-dependent optimal algorithms */