git.sesse.net Git - x264/commitdiff
Threaded lookahead
author: Steven Walters <kemuri9@gmail.com>
Wed, 2 Sep 2009 01:46:51 +0000 (18:46 -0700)
committer: Fiona Glaser <fiona@x264.com>
Wed, 2 Sep 2009 04:06:20 +0000 (21:06 -0700)
Move lookahead into a separate thread, set to higher priority than the other threads, for optimal performance.
Reduces the amount that lookahead bottlenecks encoding, greatly increasing performance with lookahead-intensive settings (e.g. b-adapt 2) on many-core CPUs.
Buffer size can be controlled with --sync-lookahead, which defaults to auto (threads+bframes buffer size).
Note that this buffer is separate from the rc-lookahead value.
Note also that this does not split lookahead itself into multiple threads yet; this may be added in the future.
Additionally, split frames into "fdec" and "fenc" frame types and keep the two separate.
This split greatly reduces memory usage, which helps compensate for the larger lookahead size.
Extremely special thanks to Michael Kazmier and Alex Giladi of Avail Media, the original authors of this patch.

16 files changed:
Makefile
common/common.c
common/common.h
common/cpu.h
common/frame.c
common/frame.h
common/macroblock.c
common/osdep.h
common/x86/cpu-a.asm
encoder/analyse.h
encoder/encoder.c
encoder/lookahead.c [new file with mode: 0644]
encoder/ratecontrol.c
encoder/slicetype.c
x264.c
x264.h

index 0f34736ee1a981c6629a9ba0555e1e206d5d065d..04d639e8c10ba55bfc463ee4d613ea56cb4a3951 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
        common/quant.c common/vlc.c \
        encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
        encoder/set.c encoder/macroblock.c encoder/cabac.c \
-       encoder/cavlc.c encoder/encoder.c
+       encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
 
 SRCCLI = x264.c matroska.c muxers.c
 
index c0a56e3d79bf7774913a76a5e1ce48dc48da0f50..461738700faf1596fcedb519539431d78c76a5f2 100644 (file)
@@ -45,6 +45,7 @@ void    x264_param_default( x264_param_t *param )
     param->cpu = x264_cpu_detect();
     param->i_threads = X264_THREADS_AUTO;
     param->b_deterministic = 1;
+    param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
 
     /* Video properties */
     param->i_csp           = X264_CSP_I420;
@@ -276,6 +277,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         else
             p->i_threads = atoi(value);
     }
+    OPT("sync-lookahead")
+    {
+        if( !strcmp(value, "auto") )
+            p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
+        else
+            p->i_sync_lookahead = atoi(value);
+    }
     OPT2("deterministic", "n-deterministic")
         p->b_deterministic = atobool(value);
     OPT2("level", "level-idc")
index 574040d0f6dff90900403548aae5be3dd06c34c5..81c7b003334e50bfee0e5a0fe93d504116936aa4 100644 (file)
@@ -239,6 +239,19 @@ typedef struct
 
 } x264_slice_header_t;
 
+typedef struct x264_lookahead_t
+{
+    uint8_t                       b_thread_active;
+    uint8_t                       b_exit_thread;
+    uint8_t                       b_analyse_keyframe;
+    int                           i_last_idr;
+    int                           i_slicetype_length;
+    x264_frame_t                  *last_nonb;
+    x264_synch_frame_list_t       ifbuf;
+    x264_synch_frame_list_t       next;
+    x264_synch_frame_list_t       ofbuf;
+} x264_lookahead_t;
+
 /* From ffmpeg
  */
 #define X264_SCAN8_SIZE (6*8)
@@ -283,7 +296,7 @@ struct x264_t
     /* encoder parameters */
     x264_param_t    param;
 
-    x264_t          *thread[X264_THREAD_MAX];
+    x264_t          *thread[X264_THREAD_MAX+1];
     x264_pthread_t  thread_handle;
     int             b_thread_active;
     int             i_thread_phase; /* which thread to use for the next frame */
@@ -349,13 +362,9 @@ struct x264_t
     struct
     {
         /* Frames to be encoded (whose types have been decided) */
-        x264_frame_t *current[X264_LOOKAHEAD_MAX+3];
-        /* Temporary buffer (frames types not yet decided) */
-        x264_frame_t *next[X264_LOOKAHEAD_MAX+3];
-        /* Unused frames */
-        x264_frame_t *unused[X264_LOOKAHEAD_MAX + X264_THREAD_MAX*2 + 16+4];
-        /* For adaptive B decision */
-        x264_frame_t *last_nonb;
+        x264_frame_t **current;
+        /* Unused frames: 0 = fenc, 1 = fdec */
+        x264_frame_t **unused[2];
 
         /* frames used for reference + sentinels */
         x264_frame_t *reference[16+2];
@@ -667,6 +676,7 @@ struct x264_t
 #if VISUALIZE
     struct visualize_t *visualize;
 #endif
+    x264_lookahead_t *lookahead;
 };
 
 // included at the end because it needs x264_t
index 4380a3598b5860c8f6f3d2fcbd881c1816f44090..6901e1e18c99f8be9e6177b2b147c040a649e65a 100644 (file)
@@ -33,12 +33,12 @@ void     x264_cpu_mask_misalign_sse( void );
  * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
  * problem, but I don't want to require such a new version.
  * This applies only to x86_32, since other architectures that need alignment
- * also have ABIs that ensure aligned stack. */
+ * either have ABIs that ensure aligned stack, or don't support it at all. */
 #if defined(ARCH_X86) && defined(HAVE_MMX)
-int x264_stack_align( void (*func)(x264_t*), x264_t *arg );
-#define x264_stack_align(func,arg) x264_stack_align((void (*)(x264_t*))func,arg)
+int x264_stack_align( void (*func)(), ... );
+#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
 #else
-#define x264_stack_align(func,arg) func(arg)
+#define x264_stack_align(func,...) func(__VA_ARGS__)
 #endif
 
 typedef struct {
index 99052602a07f871aca123b1ebdab4ce7991a894c..001c4fd9b7e20402cb74c0c7a6b316738e50ba64 100644 (file)
@@ -26,7 +26,7 @@
 
 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
 
-x264_frame_t *x264_frame_new( x264_t *h )
+x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
 {
     x264_frame_t *frame;
     int i, j;
@@ -60,9 +60,23 @@ x264_frame_t *x264_frame_new( x264_t *h )
         CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
         frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
     }
+
+    for( i = 0; i < h->param.i_bframe + 2; i++ )
+        for( j = 0; j < h->param.i_bframe + 2; j++ )
+            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+
+    frame->i_poc = -1;
+    frame->i_type = X264_TYPE_AUTO;
+    frame->i_qpplus1 = 0;
+    frame->i_pts = -1;
+    frame->i_frame = -1;
+    frame->i_frame_num = -1;
+    frame->i_lines_completed = -1;
+    frame->b_fdec = b_fdec;
+
     /* all 4 luma planes allocated together, since the cacheline split code
      * requires them to be in-phase wrt cacheline alignment. */
-    if( h->param.analyse.i_subpel_refine )
+    if( h->param.analyse.i_subpel_refine && b_fdec )
     {
         CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
         for( i = 0; i < 4; i++ )
@@ -75,77 +89,68 @@ x264_frame_t *x264_frame_new( x264_t *h )
         frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
     }
 
-    if( h->frames.b_have_lowres )
+    if( b_fdec ) /* fdec frame */
     {
-        frame->i_width_lowres = frame->i_width[0]/2;
-        frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
-        frame->i_lines_lowres = frame->i_lines[0]/2;
-
-        luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
-
-        CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
-        for( i = 0; i < 4; i++ )
-            frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
-
-        for( j = 0; j <= !!h->param.i_bframe; j++ )
-            for( i = 0; i <= h->param.i_bframe; i++ )
-            {
-                CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
-                CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
-            }
-        CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
-        for( j = 0; j <= h->param.i_bframe+1; j++ )
-            for( i = 0; i <= h->param.i_bframe+1; i++ )
-            {
-                CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
-                CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
-            }
-        frame->i_intra_cost = frame->lowres_costs[0][0];
-        memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+        CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
+        CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
+        CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
+        if( h->param.i_bframe )
+        {
+            CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
+            CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
+        }
+        else
+        {
+            frame->mv[1]  = NULL;
+            frame->ref[1] = NULL;
+        }
+        CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
+        CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
+        if( h->param.analyse.i_me_method >= X264_ME_ESA )
+        {
+            CHECKED_MALLOC( frame->buffer[3],
+                            frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
+            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
+        }
     }
-
-    if( h->param.analyse.i_me_method >= X264_ME_ESA )
+    else /* fenc frame */
     {
-        CHECKED_MALLOC( frame->buffer[3],
-                        frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
-        frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
-    }
-
-    frame->i_poc = -1;
-    frame->i_type = X264_TYPE_AUTO;
-    frame->i_qpplus1 = 0;
-    frame->i_pts = -1;
-    frame->i_frame = -1;
-    frame->i_frame_num = -1;
-    frame->i_lines_completed = -1;
+        if( h->frames.b_have_lowres )
+        {
+            frame->i_width_lowres = frame->i_width[0]/2;
+            frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
+            frame->i_lines_lowres = frame->i_lines[0]/2;
 
-    CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
-    CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
-    CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
-    if( h->param.i_bframe )
-    {
-        CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
-        CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
-    }
-    else
-    {
-        frame->mv[1]  = NULL;
-        frame->ref[1] = NULL;
-    }
+            luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
 
-    CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
-    CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
-    for( i = 0; i < h->param.i_bframe + 2; i++ )
-        for( j = 0; j < h->param.i_bframe + 2; j++ )
-            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+            CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
+            for( i = 0; i < 4; i++ )
+                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
 
-    if( h->param.rc.i_aq_mode )
-    {
-        CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
-        CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
-        if( h->frames.b_have_lowres )
-            /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
-            CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+            for( j = 0; j <= !!h->param.i_bframe; j++ )
+                for( i = 0; i <= h->param.i_bframe; i++ )
+                {
+                    CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
+                    CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
+                }
+            CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
+            for( j = 0; j <= h->param.i_bframe+1; j++ )
+                for( i = 0; i <= h->param.i_bframe+1; i++ )
+                {
+                    CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
+                    CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
+                }
+            frame->i_intra_cost = frame->lowres_costs[0][0];
+            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+        }
+        if( h->param.rc.i_aq_mode )
+        {
+            CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+            CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
+            if( h->frames.b_have_lowres )
+                /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
+                CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+        }
     }
 
     if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
@@ -971,19 +976,19 @@ void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
     assert( frame->i_reference_count > 0 );
     frame->i_reference_count--;
     if( frame->i_reference_count == 0 )
-        x264_frame_push( h->frames.unused, frame );
-    assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
+        x264_frame_push( h->frames.unused[frame->b_fdec], frame );
 }
 
-x264_frame_t *x264_frame_pop_unused( x264_t *h )
+x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
 {
     x264_frame_t *frame;
-    if( h->frames.unused[0] )
-        frame = x264_frame_pop( h->frames.unused );
+    if( h->frames.unused[b_fdec][0] )
+        frame = x264_frame_pop( h->frames.unused[b_fdec] );
     else
-        frame = x264_frame_new( h );
+        frame = x264_frame_new( h, b_fdec );
     if( !frame )
         return NULL;
+    frame->b_last_minigop_bframe = 0;
     frame->i_reference_count = 1;
     frame->b_intra_calculated = 0;
     return frame;
@@ -1008,3 +1013,54 @@ void x264_frame_sort( x264_frame_t **list, int b_dts )
         }
     } while( !b_ok );
 }
+
+void x264_frame_delete_list( x264_frame_t **list )
+{
+    int i = 0;
+    while( list[i] )
+        x264_frame_delete( list[i++] );
+    x264_free( list );
+}
+
+int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
+{
+    if( max_size < 0 )
+        return -1;
+    slist->i_max_size = max_size;
+    slist->i_size = 0;
+    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
+    if( x264_pthread_mutex_init( &slist->mutex, NULL ) ||
+        x264_pthread_cond_init( &slist->cv_fill, NULL ) ||
+        x264_pthread_cond_init( &slist->cv_empty, NULL ) )
+        return -1;
+    return 0;
+fail:
+    return -1;
+}
+
+void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
+{
+    x264_pthread_mutex_destroy( &slist->mutex );
+    x264_pthread_cond_destroy( &slist->cv_fill );
+    x264_pthread_cond_destroy( &slist->cv_empty );
+    x264_frame_delete_list( slist->list );
+}
+
+void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
+{
+    x264_pthread_mutex_lock( &slist->mutex );
+    while( slist->i_size == slist->i_max_size )
+        x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
+    slist->list[ slist->i_size++ ] = frame;
+    x264_pthread_mutex_unlock( &slist->mutex );
+    x264_pthread_cond_broadcast( &slist->cv_fill );
+}
+
+int x264_synch_frame_list_get_size( x264_synch_frame_list_t *slist )
+{
+    int size;
+    x264_pthread_mutex_lock( &slist->mutex );
+    size = slist->i_size;
+    x264_pthread_mutex_unlock( &slist->mutex );
+    return size;
+}
index 9ca83f9391a3ce92d5d3fd5ae1be2add43a9d69a..f6faa12b003dd89635dfdcdf1192fcec96b345a1 100644 (file)
@@ -40,6 +40,9 @@ typedef struct
     int     i_frame;    /* Presentation frame number */
     int     i_frame_num; /* Coded frame number */
     int     b_kept_as_ref;
+    uint8_t b_fdec;
+    uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
+    uint8_t i_bframes;   /* number of bframes following this nonb in coded order */
     float   f_qp_avg_rc; /* QPs as decided by ratecontrol */
     float   f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
 
@@ -104,6 +107,17 @@ typedef struct
 
 } x264_frame_t;
 
+/* synchronized frame list */
+typedef struct
+{
+   x264_frame_t **list;
+   int i_max_size;
+   int i_size;
+   x264_pthread_mutex_t     mutex;
+   x264_pthread_cond_t      cv_fill;  /* event signaling that the list became fuller */
+   x264_pthread_cond_t      cv_empty; /* event signaling that the list became emptier */
+} x264_synch_frame_list_t;
+
 typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
 typedef struct
@@ -118,7 +132,7 @@ typedef struct
     x264_deblock_intra_t deblock_h_chroma_intra;
 } x264_deblock_function_t;
 
-x264_frame_t *x264_frame_new( x264_t *h );
+x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
 void          x264_frame_delete( x264_frame_t *frame );
 
 int           x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
@@ -144,8 +158,15 @@ x264_frame_t *x264_frame_pop( x264_frame_t **list );
 void          x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
 x264_frame_t *x264_frame_shift( x264_frame_t **list );
 void          x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
-x264_frame_t *x264_frame_pop_unused( x264_t *h );
+x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
 void          x264_frame_sort( x264_frame_t **list, int b_dts );
+void          x264_frame_delete_list( x264_frame_t **list );
+
+int           x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int nelem );
+void          x264_synch_frame_list_delete( x264_synch_frame_list_t *slist );
+void          x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame );
+int           x264_synch_frame_list_get_size( x264_synch_frame_list_t *slist );
+
 #define x264_frame_sort_dts(list) x264_frame_sort(list, 1)
 #define x264_frame_sort_pts(list) x264_frame_sort(list, 0)
 
index 790dde22b82e499da96b377198670c99b9ff52ed..6e866d4c04ed7e52730ef35991d92a66db4fe91a 100644 (file)
@@ -703,7 +703,7 @@ int x264_macroblock_cache_init( x264_t *h )
         for( j=0; j<3; j++ )
         {
             /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
-            CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], h->fdec->i_stride[j] );
+            CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
             h->mb.intra_border_backup[i][j] += 8;
         }
 
index 696bbc9d015a602ee81de8abceffdf79f5b3b29e..9d6a1e635bc2942b0d6bc687e954dc900b8a8ebf 100644 (file)
@@ -137,6 +137,9 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
 #define x264_pthread_cond_destroy    pthread_cond_destroy
 #define x264_pthread_cond_broadcast  pthread_cond_broadcast
 #define x264_pthread_cond_wait       pthread_cond_wait
+#define x264_pthread_attr_t          pthread_attr_t
+#define x264_pthread_attr_init       pthread_attr_init
+#define x264_pthread_attr_destroy    pthread_attr_destroy
 #else
 #define x264_pthread_mutex_t         int
 #define x264_pthread_mutex_init(m,f) 0
@@ -148,6 +151,9 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
 #define x264_pthread_cond_destroy(c)
 #define x264_pthread_cond_broadcast(c)
 #define x264_pthread_cond_wait(c,m)
+#define x264_pthread_attr_t          int
+#define x264_pthread_attr_init(a)    0
+#define x264_pthread_attr_destroy(a)
 #endif
 
 #define WORD_SIZE sizeof(void*)
@@ -216,4 +222,11 @@ static int ALWAYS_INLINE x264_clz( uint32_t x )
 }
 #endif
 
+#if defined(SYS_LINUX) && defined(HAVE_PTHREAD)
+#include <unistd.h>
+#define x264_lower_thread_priority(p) { UNUSED int nice_ret = nice(p); }
+#else
+#define x264_lower_thread_priority(p)
+#endif
+
 #endif /* X264_OSDEP_H */
index 2df98fd47bf340dd4bcaa4428fd93b63ac32446e..285111a9c91417d011ee83e99a6ab290c82f91fe 100644 (file)
@@ -96,11 +96,13 @@ cglobal x264_cpu_cpuid, 0,6
 cglobal x264_stack_align
     push ebp
     mov  ebp, esp
-    sub  esp, 4
+    sub  esp, 8
     and  esp, ~15
     mov  ecx, [ebp+8]
     mov  edx, [ebp+12]
     mov  [esp], edx
+    mov  edx, [ebp+16]
+    mov  [esp+4], edx
     call ecx
     leave
     ret
index a2a04a557670caed15bc84220612e72e8428fbe0..05aae40d00546e03a5a6ba15335a507e8173c2ac 100644 (file)
@@ -28,4 +28,12 @@ int  x264_macroblock_analyse( x264_t *h );
 void x264_slicetype_decide( x264_t *h );
 int  x264_lowres_context_alloc( x264_t *h );
 
+void x264_slicetype_analyse( x264_t *h, int keyframe );
+
+int  x264_lookahead_init( x264_t *h, int i_slicetype_length );
+int  x264_lookahead_is_empty( x264_t *h );
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
+void x264_lookahead_get_frames( x264_t *h );
+void x264_lookahead_delete( x264_t *h );
+
 #endif
index eb6c435040c8f6465c13b475b065a15d08e2072f..c6b3398049cfd284a7b0e1368347f3f897bd97bd 100644 (file)
@@ -364,7 +364,7 @@ static int x264_validate_parameters( x264_t *h )
         return -1;
     }
 
-    if( h->param.i_threads == 0 )
+    if( h->param.i_threads == X264_THREADS_AUTO )
         h->param.i_threads = x264_cpu_num_processors() * 3/2;
     h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
     if( h->param.i_threads > 1 )
@@ -519,6 +519,14 @@ static int x264_validate_parameters( x264_t *h )
         h->param.rc.b_mb_tree = 0;
     if( h->param.rc.f_qcompress == 1 )
         h->param.rc.b_mb_tree = 0;
+#ifdef HAVE_PTHREAD
+    if( h->param.i_sync_lookahead )
+        h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX );
+    if( h->param.rc.b_stat_read || h->param.i_threads == 1 )
+        h->param.i_sync_lookahead = 0;
+#else
+    h->param.i_sync_lookahead = 0;
+#endif
 
     h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
                                 && h->param.i_bframe
@@ -740,7 +748,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
 {
     x264_t *h;
     char buf[1000], *p;
-    int i;
+    int i, i_slicetype_length;
 
     CHECKED_MALLOCZERO( h, sizeof(x264_t) );
 
@@ -793,8 +801,10 @@ x264_t *x264_encoder_open( x264_param_t *param )
         h->frames.i_delay = h->param.i_bframe;
     if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
         h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
+    i_slicetype_length = h->frames.i_delay;
     h->frames.i_delay += h->param.i_threads - 1;
     h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
+    h->frames.i_delay += h->param.i_sync_lookahead;
 
     h->frames.i_max_ref0 = h->param.i_frame_reference;
     h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
@@ -810,7 +820,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
 
     h->frames.i_last_idr = - h->param.i_keyint_max;
     h->frames.i_input    = 0;
-    h->frames.last_nonb  = NULL;
+
+    CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
+    /* Allocate room for max refs plus a few extra just in case. */
+    CHECKED_MALLOCZERO( h->frames.unused[1], (h->param.i_threads + 20) * sizeof(x264_frame_t *) );
+    CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
+                        + h->param.i_threads + 3) * sizeof(x264_frame_t *) );
 
     h->i_ref0 = 0;
     h->i_ref1 = 0;
@@ -861,14 +876,14 @@ x264_t *x264_encoder_open( x264_param_t *param )
 
     h->thread[0] = h;
     h->i_thread_num = 0;
-    for( i = 1; i < h->param.i_threads; i++ )
+    for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
         CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
 
     for( i = 0; i < h->param.i_threads; i++ )
     {
         if( i > 0 )
             *h->thread[i] = *h;
-        h->thread[i]->fdec = x264_frame_pop_unused( h );
+        h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
         if( !h->thread[i]->fdec )
             goto fail;
         CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
@@ -879,6 +894,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
             goto fail;
     }
 
+    if( x264_lookahead_init( h, i_slicetype_length ) )
+        goto fail;
+
     if( x264_ratecontrol_new( h ) < 0 )
         goto fail;
 
@@ -1181,8 +1199,6 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
 
 static inline int x264_reference_update( x264_t *h )
 {
-    int i;
-
     if( h->fdec->i_frame >= 0 )
         h->i_frame++;
 
@@ -1191,29 +1207,18 @@ static inline int x264_reference_update( x264_t *h )
         if( h->param.i_threads > 1 )
         {
             x264_frame_push_unused( h, h->fdec );
-            h->fdec = x264_frame_pop_unused( h );
+            h->fdec = x264_frame_pop_unused( h, 1 );
             if( !h->fdec )
                 return -1;
         }
         return 0;
     }
 
-    /* move lowres copy of the image to the ref frame */
-    for( i = 0; i < 4; i++)
-    {
-        XCHG( uint8_t*, h->fdec->lowres[i], h->fenc->lowres[i] );
-        XCHG( uint8_t*, h->fdec->buffer_lowres[i], h->fenc->buffer_lowres[i] );
-    }
-
-    /* adaptive B decision needs a pointer, since it can't use the ref lists */
-    if( h->sh.i_type != SLICE_TYPE_B )
-        h->frames.last_nonb = h->fdec;
-
     /* move frame in the buffer */
     x264_frame_push( h->frames.reference, h->fdec );
     if( h->frames.reference[h->frames.i_max_dpb] )
         x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) );
-    h->fdec = x264_frame_pop_unused( h );
+    h->fdec = x264_frame_pop_unused( h, 1 );
     if( !h->fdec )
         return -1;
     return 0;
@@ -1516,6 +1521,8 @@ static void *x264_slices_write( x264_t *h )
 {
     int i_frame_size = 0;
     int i_slice_num = 0;
+    if( h->param.i_sync_lookahead )
+        x264_lower_thread_priority( 10 );
 
 #ifdef HAVE_MMX
     /* Misalign mask has to be set separately for each thread. */
@@ -1619,7 +1626,7 @@ int     x264_encoder_encode( x264_t *h,
     if( pic_in != NULL )
     {
         /* 1: Copy the picture to a frame and move it to a buffer */
-        x264_frame_t *fenc = x264_frame_pop_unused( h );
+        x264_frame_t *fenc = x264_frame_pop_unused( h, 0 );
         if( !fenc )
             return -1;
 
@@ -1632,8 +1639,6 @@ int     x264_encoder_encode( x264_t *h,
 
         fenc->i_frame = h->frames.i_input++;
 
-        x264_frame_push( h->frames.next, fenc );
-
         if( h->frames.b_have_lowres )
             x264_frame_init_lowres( h, fenc );
 
@@ -1645,55 +1650,33 @@ int     x264_encoder_encode( x264_t *h,
         else if( h->param.rc.i_aq_mode )
             x264_adaptive_quant_frame( h, fenc );
 
+        /* 2: Place the frame into the queue for its slice type decision */
+        x264_lookahead_put_frame( h, fenc );
+
         if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
         {
-            /* Nothing yet to encode */
-            /* waiting for filling bframe buffer */
+            /* Nothing yet to encode, waiting for filling of buffers */
             pic_out->i_type = X264_TYPE_AUTO;
             return 0;
         }
     }
-
-    if( h->frames.current[0] == NULL )
+    else
     {
-        int bframes = 0;
-        /* 2: Select frame types */
-        if( h->frames.next[0] == NULL )
-        {
-            if( x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out ) < 0 )
-                return -1;
-            return 0;
-        }
+        /* signal kills for lookahead thread */
+        h->lookahead->b_exit_thread = 1;
+        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
+    }
 
-        x264_stack_align( x264_slicetype_decide, h );
+    /* 3: The picture is analyzed in the lookahead */
+    if( !h->frames.current[0] )
+        x264_lookahead_get_frames( h );
 
-        /* 3: move some B-frames and 1 non-B to encode queue */
-        while( IS_X264_TYPE_B( h->frames.next[bframes]->i_type ) )
-            bframes++;
-        x264_frame_push( h->frames.current, x264_frame_shift( &h->frames.next[bframes] ) );
-        /* FIXME: when max B-frames > 3, BREF may no longer be centered after GOP closing */
-        if( h->param.b_bframe_pyramid && bframes > 1 )
-        {
-            x264_frame_t *mid = x264_frame_shift( &h->frames.next[bframes/2] );
-            mid->i_type = X264_TYPE_BREF;
-            x264_frame_push( h->frames.current, mid );
-            bframes--;
-        }
-        while( bframes-- )
-            x264_frame_push( h->frames.current, x264_frame_shift( h->frames.next ) );
-    }
+    if( !h->frames.current[0] && x264_lookahead_is_empty( h ) )
+        return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
 
     /* ------------------- Get frame to be encoded ------------------------- */
     /* 4: get picture to encode */
     h->fenc = x264_frame_shift( h->frames.current );
-    if( h->fenc == NULL )
-    {
-        /* Nothing yet to encode (ex: waiting for I/P with B frames) */
-        /* waiting for filling bframe buffer */
-        pic_out->i_type = X264_TYPE_AUTO;
-        return 0;
-    }
-
     if( h->fenc->param )
     {
         x264_encoder_reconfig( h, h->fenc->param );
@@ -1704,6 +1687,7 @@ int     x264_encoder_encode( x264_t *h,
     if( h->fenc->i_type == X264_TYPE_IDR )
     {
         h->frames.i_last_idr = h->fenc->i_frame;
+        h->i_frame_num = 0;
     }
 
     /* ------------------- Setup frame context ----------------------------- */
@@ -2029,6 +2013,8 @@ void    x264_encoder_close  ( x264_t *h )
                    || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
                    || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
 
+    x264_lookahead_delete( h );
+
     for( i=0; i<h->param.i_threads; i++ )
     {
         // don't strictly have to wait for the other threads, but it's simpler than canceling them
@@ -2248,21 +2234,9 @@ void    x264_encoder_close  ( x264_t *h )
         h = h->thread[ h->i_thread_phase % h->param.i_threads ];
 
     /* frames */
-    for( i = 0; h->frames.current[i]; i++ )
-    {
-        assert( h->frames.current[i]->i_reference_count == 1 );
-        x264_frame_delete( h->frames.current[i] );
-    }
-    for( i = 0; h->frames.next[i]; i++ )
-    {
-        assert( h->frames.next[i]->i_reference_count == 1 );
-        x264_frame_delete( h->frames.next[i] );
-    }
-    for( i = 0; h->frames.unused[i]; i++ )
-    {
-        assert( h->frames.unused[i]->i_reference_count == 0 );
-        x264_frame_delete( h->frames.unused[i] );
-    }
+    x264_frame_delete_list( h->frames.unused[0] );
+    x264_frame_delete_list( h->frames.unused[1] );
+    x264_frame_delete_list( h->frames.current );
 
     h = h->thread[0];
 
@@ -2302,7 +2276,8 @@ int x264_encoder_delayed_frames( x264_t *h )
     h = h->thread[ h->i_thread_phase % h->param.i_threads ];
     for( i=0; h->frames.current[i]; i++ )
         delayed_frames++;
-    for( i=0; h->frames.next[i]; i++ )
-        delayed_frames++;
+    delayed_frames += x264_synch_frame_list_get_size( &h->lookahead->ifbuf );
+    delayed_frames += x264_synch_frame_list_get_size( &h->lookahead->next );
+    delayed_frames += x264_synch_frame_list_get_size( &h->lookahead->ofbuf );
     return delayed_frames;
 }
diff --git a/encoder/lookahead.c b/encoder/lookahead.c
new file mode 100644 (file)
index 0000000..9df0ce3
--- /dev/null
@@ -0,0 +1,278 @@
+/*****************************************************************************
+ * lookahead.c: Lookahead slicetype decisions for x264
+ *****************************************************************************
+ * Lookahead.c and associated modifications:
+ *     Copyright (C) 2008 Avail Media
+ *
+ * Authors: Michael Kazmier <mkazmier@availmedia.com>
+ *          Alex Giladi <agiladi@availmedia.com>
+ *          Steven Walters <kemuri9@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* LOOKAHEAD (threaded and non-threaded mode)
+ *
+ * Lookahead types:
+ *     [1] Slice type / scene cut;
+ *
+ * In non-threaded mode, we run the existing slicetype decision code as it was.
+ * In threaded mode, we run in a separate thread, that lives between the calls
+ * to x264_encoder_open() and x264_encoder_close(), and performs lookahead for
+ * the number of frames specified in rc_lookahead.  Recommended setting is
+ * # of bframes + # of threads.
+ */
+#include "common/common.h"
+#include "common/cpu.h"
+#include "analyse.h"
+
+/* Transfer `count` frames from the head of src to the tail of dst, in order.
+ * The size counters of both lists are kept in step, and if anything was
+ * requested the waiters on dst->cv_fill and src->cv_empty are woken up.
+ * Neither list's mutex is taken here. */
+static void x264_lookahead_shift( x264_synch_frame_list_t *dst, x264_synch_frame_list_t *src, int count )
+{
+    int remaining = count;
+    if( !remaining )
+        return;
+
+    do
+    {
+        assert( dst->i_size != dst->i_max_size );
+        assert( src->i_size );
+        dst->list[ dst->i_size ] = x264_frame_shift( src->list );
+        dst->i_size++;
+        src->i_size--;
+    } while( --remaining );
+
+    x264_pthread_cond_broadcast( &dst->cv_fill );
+    x264_pthread_cond_broadcast( &src->cv_empty );
+}
+
+/* Make new_nonb the lookahead's remembered non-B frame.
+ * The previously remembered frame, if there is one, is handed to
+ * x264_frame_push_unused(); the new one gains a reference. */
+static void x264_lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb )
+{
+    x264_frame_t *previous = h->lookahead->last_nonb;
+    if( previous != NULL )
+        x264_frame_push_unused( h, previous );
+    new_nonb->i_reference_count++;
+    h->lookahead->last_nonb = new_nonb;
+}
+
+#ifdef HAVE_PTHREAD
+/* Run one slice-type decision and move the resulting group of frames
+ * (the leading B-frames plus the non-B frame that follows them) from the
+ * `next` list to the output buffer `ofbuf`.
+ * Called from x264_lookahead_thread(). */
+static void x264_lookahead_slicetype_decide( x264_t *h )
+{
+    int bframes = 0;
+    x264_stack_align( x264_slicetype_decide, h );
+
+    /* Count the B-frames at the head of `next`; list[bframes] is the first non-B frame. */
+    while( IS_X264_TYPE_B( h->lookahead->next.list[bframes]->i_type ) )
+        bframes++;
+    x264_lookahead_update_last_nonb( h, h->lookahead->next.list[bframes] );
+
+    /* Wait until ofbuf is no longer completely full.
+     * NOTE(review): this only guarantees one free slot, while bframes + 1
+     * frames are shifted below -- confirm the ofbuf sizing makes that safe. */
+    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+    while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size )
+        x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex );
+
+    /* ofbuf.mutex is still held here; next.mutex is nested inside it for the transfer. */
+    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+    x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, bframes + 1 );
+    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+
+    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+    /* Note that this analysis runs before ofbuf.mutex is released. */
+    if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
+        x264_stack_align( x264_slicetype_analyse, h, 1 );
+
+    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+}
+
+/* Body of the lookahead thread (started by x264_lookahead_init()).
+ * Repeatedly pulls input frames from `ifbuf` into `next` and, once more than
+ * i_slicetype_length frames are queued, runs a slice-type decision.
+ * When b_exit_thread is set it drains the remaining frames and clears
+ * b_thread_active before returning. */
+static void x264_lookahead_thread( x264_t *h )
+{
+    int shift;
+#ifdef HAVE_MMX
+    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+        x264_cpu_mask_misalign_sse();
+#endif
+    h->lookahead->b_thread_active = 1;
+    while( !h->lookahead->b_exit_thread )
+    {
+        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+        x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+        /* Move as many input frames as `next` has room for. */
+        shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
+        x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
+        x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+        if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length )
+        {
+            /* Not enough frames queued yet: sleep until input arrives or exit is requested. */
+            while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread )
+                x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex );
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+        }
+        else
+        {
+            /* Release the input buffer before the decision so it is not held during it. */
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+            x264_lookahead_slicetype_decide( h );
+        }
+    }   /* end of input frames */
+    /* Flush: take everything still in ifbuf and decide until `next` is empty.
+     * NOTE(review): the two mutexes are taken here in the opposite order
+     * (next, then ifbuf) from the loop above (ifbuf, then next). */
+    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+    x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size );
+    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+    while( h->lookahead->next.i_size )
+        x264_lookahead_slicetype_decide( h );
+    h->lookahead->b_thread_active = 0;
+}
+#endif
+
+/* Allocate and initialise the lookahead context shared by the encoder threads.
+ *
+ * h:                  encoder handle; h->thread[0..i_threads-1] all receive
+ *                     the same x264_lookahead_t pointer.
+ * i_slicetype_length: stored as look->i_slicetype_length.
+ *
+ * When h->param.i_sync_lookahead is non-zero, h->thread[i_threads] is set up
+ * as a copy of h and the lookahead thread is started on it.
+ *
+ * Returns 0 on success, -1 on failure.  On failure the context is freed and
+ * the per-thread lookahead pointers are reset to NULL.
+ * NOTE(review): frame lists and the macroblock cache that were already
+ * initialised when a later step fails are not released here. */
+int x264_lookahead_init( x264_t *h, int i_slicetype_length )
+{
+    x264_lookahead_t *look;
+    CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) );
+    int i;
+    for( i = 0; i < h->param.i_threads; i++ )
+        h->thread[i]->lookahead = look;
+
+    look->i_last_idr = - h->param.i_keyint_max;
+    look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead))
+                               && !h->param.rc.b_stat_read;
+    look->i_slicetype_length = i_slicetype_length;
+
+    /* init frame lists */
+    if( x264_synch_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) ||
+        x264_synch_frame_list_init( &look->next, h->frames.i_delay+3 ) ||
+        x264_synch_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) )
+        goto fail;
+
+    /* No lookahead thread requested: the lists alone are enough. */
+    if( !h->param.i_sync_lookahead )
+        return 0;
+
+    /* The lookahead thread works on its own copy of the encoder handle. */
+    x264_t *look_h = h->thread[h->param.i_threads];
+    *look_h = *h;
+    if( x264_macroblock_cache_init( look_h ) )
+        goto fail;
+
+    UNUSED x264_pthread_attr_t attr;
+    if( x264_pthread_attr_init( &attr ) )
+        goto fail;
+#if defined(USE_REAL_PTHREAD) && !defined(SYS_LINUX)
+    /* Ask for the highest SCHED_OTHER priority for the lookahead thread. */
+    int offset = sched_get_priority_max( SCHED_OTHER );
+    x264_log( h, X264_LOG_DEBUG, "setting priority of lookahead thread to %d\n", offset );
+    struct sched_param sp;
+    pthread_attr_getschedparam( &attr, &sp );
+    sp.sched_priority = offset;
+    pthread_attr_setschedparam( &attr, &sp );
+#endif
+
+    if( x264_pthread_create( &look_h->thread_handle, &attr, (void *)x264_lookahead_thread, look_h ) )
+    {
+        /* The attribute object was initialised above; release it on this path too. */
+        x264_pthread_attr_destroy( &attr );
+        goto fail;
+    }
+
+    x264_pthread_attr_destroy( &attr );
+
+    return 0;
+fail:
+    /* Do not leave the encoder threads pointing at the context freed below.
+     * `look` is NULL only when the allocation itself failed, in which case
+     * no pointer has been handed out yet. */
+    if( look )
+        for( i = 0; i < h->param.i_threads; i++ )
+            h->thread[i]->lookahead = NULL;
+    x264_free( look );
+    return -1;
+}
+
+/* Stop the lookahead thread (if one is running) and free the lookahead context.
+ *
+ * With h->param.i_sync_lookahead set, the thread is asked to exit, joined,
+ * and its private handle h->thread[i_threads] is released.  Afterwards the
+ * three frame lists, the remembered last non-B frame and the context itself
+ * are freed. */
+void x264_lookahead_delete( x264_t *h )
+{
+    if( h->param.i_sync_lookahead )
+    {
+        /* The lookahead thread tests b_exit_thread while holding ifbuf.mutex
+         * and then sleeps on ifbuf.cv_fill.  Setting the flag and signalling
+         * under the same mutex ensures the wake-up cannot fall between that
+         * test and the wait, which would leave the join below blocked. */
+        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+        h->lookahead->b_exit_thread = 1;
+        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
+        x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+        x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
+        x264_macroblock_cache_end( h->thread[h->param.i_threads] );
+        x264_free( h->thread[h->param.i_threads] );
+    }
+    x264_synch_frame_list_delete( &h->lookahead->ifbuf );
+    x264_synch_frame_list_delete( &h->lookahead->next );
+    x264_synch_frame_list_delete( &h->lookahead->ofbuf );
+    if( h->lookahead->last_nonb )
+        x264_frame_delete( h->lookahead->last_nonb );
+    x264_free( h->lookahead );
+}
+
+/* Queue an input frame for lookahead: into `ifbuf` when a lookahead thread
+ * is in use (i_sync_lookahead non-zero), otherwise straight into `next`. */
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame )
+{
+    x264_synch_frame_list_t *target = h->param.i_sync_lookahead ? &h->lookahead->ifbuf
+                                                                : &h->lookahead->next;
+    x264_synch_frame_list_push( target, frame );
+}
+
+/* Returns non-zero when both the `ofbuf` and `next` lists hold no frames.
+ * `next` is only queried when `ofbuf` turned out to be empty. */
+int x264_lookahead_is_empty( x264_t *h )
+{
+    if( x264_synch_frame_list_get_size( &h->lookahead->ofbuf ) )
+        return 0;
+    return !x264_synch_frame_list_get_size( &h->lookahead->next );
+}
+
+/* Move one decided group of frames from `ofbuf` to h->frames.current.
+ * The non-B frame that ends the group is pushed first, followed by the
+ * B-frames that preceded it in `ofbuf`.  Nothing is moved unless `ofbuf`
+ * contains a non-B frame.
+ * In threaded mode the caller (x264_lookahead_get_frames) holds ofbuf.mutex
+ * around this call. */
+static void x264_lookahead_encoder_shift( x264_t *h )
+{
+    int bframes  = 0;
+    int i_frames = 0;
+
+    /* Skip over the leading B-frames; bframes and i_frames advance together,
+     * so on exit both index either the first non-B frame or the list end. */
+    while( h->lookahead->ofbuf.list[i_frames] )
+    {
+        while( h->lookahead->b_thread_active && !h->lookahead->ofbuf.i_size )
+            x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex );
+        if( IS_X264_TYPE_B( h->lookahead->ofbuf.list[bframes]->i_type ) )
+            bframes++;
+        else
+            break;
+        i_frames++;
+    }
+    /* Non-NULL here means the loop stopped on a non-B frame. */
+    if( h->lookahead->ofbuf.list[i_frames] )
+    {
+        /* Take the non-B frame out from behind the B-frames and queue it first.
+         * (presumably x264_frame_shift removes the first entry of the array it
+         * is given and closes the gap -- verify against its definition) */
+        x264_frame_push( h->frames.current, x264_frame_shift( &h->lookahead->ofbuf.list[bframes] ) );
+        h->lookahead->ofbuf.i_size--;
+        if( h->param.b_bframe_pyramid && bframes > 1 )
+        {
+            /* B-pyramid: the middle B-frame is retyped as BREF and queued next. */
+            x264_frame_t *mid = x264_frame_shift( &h->lookahead->ofbuf.list[bframes/2] );
+            h->lookahead->ofbuf.i_size--;
+            mid->i_type = X264_TYPE_BREF;
+            x264_frame_push( h->frames.current, mid );
+            bframes--;
+        }
+        /* Queue the remaining B-frames in their original order. */
+        while( bframes-- )
+        {
+            x264_frame_push( h->frames.current, x264_frame_shift( h->lookahead->ofbuf.list ) );
+            h->lookahead->ofbuf.i_size--;
+        }
+        /* Space was freed in ofbuf: wake anyone waiting on cv_empty. */
+        x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_empty );
+    }
+}
+
+/* Fetch the next decided group of frames into h->frames.current.
+ * With a lookahead thread, wait for it to publish frames in `ofbuf`;
+ * without one, run the slice-type decision inline on the `next` list. */
+void x264_lookahead_get_frames( x264_t *h )
+{
+    int n_b;
+
+    if( h->param.i_sync_lookahead )
+    {
+        /* Threaded mode: frames come from the lookahead thread's output buffer. */
+        x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+        for( ;; )
+        {
+            if( h->lookahead->ofbuf.i_size || !h->lookahead->b_thread_active )
+                break;
+            x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex );
+        }
+        x264_lookahead_encoder_shift( h );
+        x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+        return;
+    }
+
+    /* Unthreaded mode: decide slice types on the fly, but only when the
+     * encoder has nothing queued and there is input to look at. */
+    if( h->frames.current[0] )
+        return;
+    if( !h->lookahead->next.i_size )
+        return;
+
+    x264_stack_align( x264_slicetype_decide, h );
+
+    /* Index of the first non-B frame in `next`. */
+    for( n_b = 0; IS_X264_TYPE_B( h->lookahead->next.list[n_b]->i_type ); n_b++ )
+        ;
+
+    x264_lookahead_update_last_nonb( h, h->lookahead->next.list[n_b] );
+    x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, n_b + 1 );
+
+    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+    if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
+        x264_stack_align( x264_slicetype_analyse, h, 1 );
+
+    x264_lookahead_encoder_shift( h );
+}
index ca19d64a82460870158794fe1ea0a4f4f1b6323c..cb7fd3b86dc3c0d1d507eaf15255e7db2553265f 100644 (file)
@@ -922,11 +922,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
     }
 
     if( h->sh.i_type != SLICE_TYPE_B )
-    {
-        rc->bframes = 0;
-        while( h->frames.current[rc->bframes] && IS_X264_TYPE_B(h->frames.current[rc->bframes]->i_type) )
-            rc->bframes++;
-    }
+        rc->bframes = h->fenc->i_bframes;
 
     if( i_force_qp )
     {
@@ -1250,7 +1246,7 @@ int x264_ratecontrol_end( x264_t *h, int bits )
         if( h->sh.i_type == SLICE_TYPE_B )
         {
             rc->bframe_bits += bits;
-            if( !h->frames.current[0] || !IS_X264_TYPE_B(h->frames.current[0]->i_type) )
+            if( h->fenc->b_last_minigop_bframe )
             {
                 update_predictor( rc->pred_b_from_p, qp2qscale(rc->qpa_rc),
                                   h->fref1[h->i_ref1-1]->i_satd, rc->bframe_bits / rc->bframes );
index af74427de5655644b76c914960072218c0bcdbb4..88aff91b36c1e87a6f411919475c90dfe92e34d9 100644 (file)
@@ -625,7 +625,7 @@ static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_fram
 /* Uses strings due to the fact that the speed of the control functions is
    negligable compared to the cost of running slicetype_frame_cost, and because
    it makes debugging easier. */
-static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, int buffer_size, char (*best_paths)[X264_LOOKAHEAD_MAX] )
+static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, char (*best_paths)[X264_LOOKAHEAD_MAX] )
 {
     char paths[X264_BFRAME_MAX+2][X264_LOOKAHEAD_MAX] = {{0}};
     int num_paths = X264_MIN(max_bframes+1, length);
@@ -666,7 +666,7 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
     int icost = frame->i_cost_est[0][0];
     int pcost = frame->i_cost_est[p1-p0][0];
     float f_bias;
-    int i_gop_size = frame->i_frame - h->frames.i_last_idr;
+    int i_gop_size = frame->i_frame - h->lookahead->i_last_idr;
     float f_thresh_max = h->param.i_scenecut_threshold / 100.0;
     /* magic numbers pulled out of thin air */
     float f_thresh_min = f_thresh_max * h->param.i_keyint_min
@@ -700,33 +700,33 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
     return res;
 }
 
-static void x264_slicetype_analyse( x264_t *h, int keyframe )
+void x264_slicetype_analyse( x264_t *h, int keyframe )
 {
     x264_mb_analysis_t a;
     x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, };
-    int num_frames;
-    int keyint_limit;
-    int i,j;
+    int num_frames, keyint_limit, idr_frame_type, i, j;
     int i_mb_count = NUM_MBS;
     int cost1p0, cost2p0, cost1b1, cost2p1;
-    int idr_frame_type;
+    int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX );
+    if( h->param.b_deterministic )
+        i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + !keyframe );
 
     assert( h->frames.b_have_lowres );
 
-    if( !h->frames.last_nonb )
+    if( !h->lookahead->last_nonb )
         return;
-    frames[0] = h->frames.last_nonb;
-    for( j = 0; h->frames.next[j] && h->frames.next[j]->i_type == X264_TYPE_AUTO; j++ )
-        frames[j+1] = h->frames.next[j];
+    frames[0] = h->lookahead->last_nonb;
+    for( j = 0; j < i_max_search && h->lookahead->next.list[j]->i_type == X264_TYPE_AUTO; j++ )
+        frames[j+1] = h->lookahead->next.list[j];
 
     if( !j )
         return;
 
-    keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->frames.i_last_idr - 1;
+    keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->lookahead->i_last_idr - 1;
     num_frames = X264_MIN( j, keyint_limit );
 
     x264_lowres_context_init( h, &a );
-    idr_frame_type = frames[1]->i_frame - h->frames.i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
+    idr_frame_type = frames[1]->i_frame - h->lookahead->i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
 
     /* This is important psy-wise: if we have a non-scenecut keyframe,
      * there will be significant visual artifacts if the frames just before
@@ -765,7 +765,7 @@ static void x264_slicetype_analyse( x264_t *h, int keyframe )
         {
             /* Perform the frametype analysis. */
             for( n = 2; n < num_frames-1; n++ )
-                x264_slicetype_path( h, &a, frames, n, max_bframes, num_frames-max_bframes, best_paths );
+                x264_slicetype_path( h, &a, frames, n, max_bframes, best_paths );
             if( num_frames > 1 )
             {
                 num_bframes = strspn( best_paths[num_frames-2], "B" );
@@ -888,15 +888,15 @@ void x264_slicetype_decide( x264_t *h )
     int bframes;
     int i;
 
-    if( h->frames.next[0] == NULL )
+    if( !h->lookahead->next.i_size )
         return;
 
     if( h->param.rc.b_stat_read )
     {
         /* Use the frame types from the first pass */
-        for( i = 0; h->frames.next[i] != NULL; i++ )
-            h->frames.next[i]->i_type =
-                x264_ratecontrol_slice_type( h, h->frames.next[i]->i_frame );
+        for( i = 0; i < h->lookahead->next.i_size; i++ )
+            h->lookahead->next.list[i]->i_type =
+                x264_ratecontrol_slice_type( h, h->lookahead->next.list[i]->i_frame );
     }
     else if( (h->param.i_bframe && h->param.i_bframe_adaptive)
              || h->param.i_scenecut_threshold
@@ -906,10 +906,10 @@ void x264_slicetype_decide( x264_t *h )
 
     for( bframes = 0;; bframes++ )
     {
-        frm = h->frames.next[bframes];
+        frm = h->lookahead->next.list[bframes];
 
         /* Limit GOP size */
-        if( frm->i_frame - h->frames.i_last_idr >= h->param.i_keyint_max )
+        if( frm->i_frame - h->lookahead->i_last_idr >= h->param.i_keyint_max )
         {
             if( frm->i_type == X264_TYPE_AUTO )
                 frm->i_type = X264_TYPE_IDR;
@@ -919,19 +919,16 @@ void x264_slicetype_decide( x264_t *h )
         if( frm->i_type == X264_TYPE_IDR )
         {
             /* Close GOP */
+            h->lookahead->i_last_idr = frm->i_frame;
             if( bframes > 0 )
             {
                 bframes--;
-                h->frames.next[bframes]->i_type = X264_TYPE_P;
-            }
-            else
-            {
-                h->i_frame_num = 0;
+                h->lookahead->next.list[bframes]->i_type = X264_TYPE_P;
             }
         }
 
-        if( bframes == h->param.i_bframe
-            || h->frames.next[bframes+1] == NULL )
+        if( bframes == h->param.i_bframe ||
+            !h->lookahead->next.list[bframes+1] )
         {
             if( IS_X264_TYPE_B( frm->i_type ) )
                 x264_log( h, X264_LOG_WARNING, "specified frame type is not compatible with max B-frames\n" );
@@ -945,45 +942,47 @@ void x264_slicetype_decide( x264_t *h )
 
         else if( !IS_X264_TYPE_B( frm->i_type ) ) break;
     }
+
+    if( bframes )
+        h->lookahead->next.list[bframes-1]->b_last_minigop_bframe = 1;
+    h->lookahead->next.list[bframes]->i_bframes = bframes;
+
+    /* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */
+    if( h->param.rc.i_rc_method != X264_RC_CQP )
+    {
+        x264_mb_analysis_t a;
+        x264_frame_t *frames[X264_BFRAME_MAX+2] = { NULL, };
+        int p0=0, p1, b;
+
+        x264_lowres_context_init( h, &a );
+
+        if( IS_X264_TYPE_I( h->lookahead->next.list[bframes]->i_type ) )
+            p1 = b = 0;
+        else // P
+            p1 = b = bframes + 1;
+        frames[p0] = h->lookahead->last_nonb;
+        frames[b] = h->lookahead->next.list[bframes];
+
+        x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+    }
 }
 
 int x264_rc_analyse_slice( x264_t *h )
 {
-    x264_mb_analysis_t a;
-    x264_frame_t *frames[X264_LOOKAHEAD_MAX+2] = { NULL, };
+    x264_frame_t *frames[X264_BFRAME_MAX+2] = { NULL, };
     int p0=0, p1, b;
     int cost;
 
-    x264_lowres_context_init( h, &a );
-
     if( IS_X264_TYPE_I(h->fenc->i_type) )
-    {
         p1 = b = 0;
-        /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
-        if( h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead) )
-        {
-            h->frames.last_nonb = h->fenc;
-            x264_slicetype_analyse( h, 1 );
-        }
-    }
-    else if( X264_TYPE_P == h->fenc->i_type )
-    {
-        p1 = 0;
-        while( h->frames.current[p1] && IS_X264_TYPE_B( h->frames.current[p1]->i_type ) )
-            p1++;
-        p1++;
-        b = p1;
-    }
-    else //B
-    {
-        p1 = (h->fref1[0]->i_poc - h->fref0[0]->i_poc)/2;
-        b  = (h->fref1[0]->i_poc - h->fenc->i_poc)/2;
-        frames[p1] = h->fref1[0];
-    }
+    else // P
+        p1 = b = h->fenc->i_bframes + 1;
     frames[p0] = h->fref0[0];
     frames[b] = h->fenc;
 
-    cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+    /* cost should have been already calculated by x264_slicetype_decide */
+    cost = frames[b]->i_cost_est[b-p0][p1-b];
+    assert( cost >= 0 );
 
     if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
         cost = x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b );
diff --git a/x264.c b/x264.c
index 76e53072d231c3862b267fa38cd48ceaf3cabc8a..0ed538a003e75b238d13a9ce8a78174771b34c68 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -355,6 +355,7 @@ static void Help( x264_param_t *defaults, int b_longhelp )
     H0( "      --ssim                  Enable SSIM computation\n" );
     H0( "      --threads <integer>     Force a specific number of threads\n" );
     H1( "      --thread-input          Run Avisynth in its own thread\n" );
+    H1( "      --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
     H1( "      --non-deterministic     Slightly improve quality of SMP, at the cost of repeatability\n" );
     H1( "      --asm <integer>         Override CPU detection\n" );
     H1( "      --no-asm                Disable all CPU optimizations\n" );
@@ -467,6 +468,7 @@ static struct option long_options[] =
     { "slice-max-mbs",     required_argument, NULL, 0 },
     { "slices",            required_argument, NULL, 0 },
     { "thread-input",      no_argument, NULL, OPT_THREAD_INPUT },
+    { "sync-lookahead",    required_argument, NULL, 0 },
     { "non-deterministic", no_argument, NULL, 0 },
     { "psnr",              no_argument, NULL, 0 },
     { "ssim",              no_argument, NULL, 0 },
@@ -988,7 +990,7 @@ generic_option:
 
 #ifdef HAVE_PTHREAD
     if( b_thread_input || param->i_threads > 1
-        || (param->i_threads == 0 && x264_cpu_num_processors() > 1) )
+        || (param->i_threads == X264_THREADS_AUTO && x264_cpu_num_processors() > 1) )
     {
         if( open_file_thread( NULL, &opt->hin, param ) )
         {
diff --git a/x264.h b/x264.h
index 5e6d411788a64bc16da0d7fa11aecd92a0f1d018..66f4f282b151982532c41800f3956685c8de6f99 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 74
+#define X264_BUILD 75
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -139,6 +139,7 @@ static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", ""
 
 /* Threading */
 #define X264_THREADS_AUTO 0 /* Automatically select optimal number of threads */
+#define X264_SYNC_LOOKAHEAD_AUTO -1 /* Automatically select optimal lookahead thread buffer size */
 
 /* Zones: override ratecontrol or other options for specific sections of the video.
  * See x264_encoder_reconfig() for which options can be changed.
@@ -158,6 +159,7 @@ typedef struct x264_param_t
     unsigned int cpu;
     int         i_threads;       /* encode multiple frames in parallel */
     int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
+    int         i_sync_lookahead; /* threaded lookahead buffer */
 
     /* Video Properties */
     int         i_width;