Threaded lookahead

author Steven Walters <kemuri9@gmail.com>

Wed, 2 Sep 2009 01:46:51 +0000 (18:46 -0700)

committer Fiona Glaser <fiona@x264.com>

Wed, 2 Sep 2009 04:06:20 +0000 (21:06 -0700)
author Steven Walters <kemuri9@gmail.com>
Wed, 2 Sep 2009 01:46:51 +0000 (18:46 -0700)
committer Fiona Glaser <fiona@x264.com>
Wed, 2 Sep 2009 04:06:20 +0000 (21:06 -0700)
diff --git a/Makefile b/Makefile

index 0f34736ee1a981c6629a9ba0555e1e206d5d065d..04d639e8c10ba55bfc463ee4d613ea56cb4a3951 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
         common/quant.c common/vlc.c \
         encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
         encoder/set.c encoder/macroblock.c encoder/cabac.c \
-       encoder/cavlc.c encoder/encoder.c
+       encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
  
  SRCCLI = x264.c matroska.c muxers.c
  
diff --git a/common/common.c b/common/common.c

index c0a56e3d79bf7774913a76a5e1ce48dc48da0f50..461738700faf1596fcedb519539431d78c76a5f2 100644 (file)
--- a/common/common.c
+++ b/common/common.c
@@ -45,6 +45,7 @@ void    x264_param_default( x264_param_t *param )
      param->cpu = x264_cpu_detect();
      param->i_threads = X264_THREADS_AUTO;
      param->b_deterministic = 1;
+    param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
  
      /* Video properties */
      param->i_csp           = X264_CSP_I420;
@@ -276,6 +277,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
          else
              p->i_threads = atoi(value);
      }
+    OPT("sync-lookahead")
+    {
+        if( !strcmp(value, "auto") )
+            p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
+        else
+            p->i_sync_lookahead = atoi(value);
+    }
      OPT2("deterministic", "n-deterministic")
          p->b_deterministic = atobool(value);
      OPT2("level", "level-idc")
diff --git a/common/common.h b/common/common.h

index 574040d0f6dff90900403548aae5be3dd06c34c5..81c7b003334e50bfee0e5a0fe93d504116936aa4 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -239,6 +239,19 @@ typedef struct
  
  } x264_slice_header_t;
  
+typedef struct x264_lookahead_t /* lookahead state; x264_t holds a pointer to it in its `lookahead` member */
+{
+    uint8_t                       b_thread_active; /* NOTE(review): presumably set while a dedicated lookahead thread is running -- confirm in lookahead.c */
+    uint8_t                       b_exit_thread; /* set to 1 by x264_encoder_encode() when called with pic_in == NULL, to make the lookahead thread exit */
+    uint8_t                       b_analyse_keyframe; /* NOTE(review): presumably the `keyframe` argument for x264_slicetype_analyse() -- confirm */
+    int                           i_last_idr; /* NOTE(review): presumably the frame number of the last IDR seen by the lookahead -- confirm */
+    int                           i_slicetype_length; /* NOTE(review): presumably the value passed to x264_lookahead_init() -- confirm */
+    x264_frame_t                  *last_nonb; /* takes over from the removed x264_t.frames.last_nonb ("For adaptive B decision") */
+    x264_synch_frame_list_t       ifbuf; /* its cv_fill is broadcast on flush; NOTE(review): presumably the input queue fed by x264_lookahead_put_frame() -- confirm */
+    x264_synch_frame_list_t       next; /* NOTE(review): presumably replaces x264_t.frames.next (frames whose types are not yet decided) -- confirm */
+    x264_synch_frame_list_t       ofbuf; /* NOTE(review): presumably the output queue drained by x264_lookahead_get_frames() -- confirm */
+} x264_lookahead_t;
+
  /* From ffmpeg
   */
  #define X264_SCAN8_SIZE (6*8)
@@ -283,7 +296,7 @@ struct x264_t
      /* encoder parameters */
      x264_param_t    param;
  
-    x264_t          *thread[X264_THREAD_MAX];
+    x264_t          *thread[X264_THREAD_MAX+1];
      x264_pthread_t  thread_handle;
      int             b_thread_active;
      int             i_thread_phase; /* which thread to use for the next frame */
@@ -349,13 +362,9 @@ struct x264_t
      struct
      {
          /* Frames to be encoded (whose types have been decided) */
-        x264_frame_t *current[X264_LOOKAHEAD_MAX+3];
-        /* Temporary buffer (frames types not yet decided) */
-        x264_frame_t *next[X264_LOOKAHEAD_MAX+3];
-        /* Unused frames */
-        x264_frame_t *unused[X264_LOOKAHEAD_MAX + X264_THREAD_MAX*2 + 16+4];
-        /* For adaptive B decision */
-        x264_frame_t *last_nonb;
+        x264_frame_t **current;
+        /* Unused frames: 0 = fenc, 1 = fdec */
+        x264_frame_t **unused[2];
  
          /* frames used for reference + sentinels */
          x264_frame_t *reference[16+2];
@@ -667,6 +676,7 @@ struct x264_t
  #if VISUALIZE
      struct visualize_t *visualize;
  #endif
+    x264_lookahead_t *lookahead;
  };
  
  // included at the end because it needs x264_t
diff --git a/common/cpu.h b/common/cpu.h

index 4380a3598b5860c8f6f3d2fcbd881c1816f44090..6901e1e18c99f8be9e6177b2b147c040a649e65a 100644 (file)
--- a/common/cpu.h
+++ b/common/cpu.h
@@ -33,12 +33,12 @@ void     x264_cpu_mask_misalign_sse( void );
   * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
   * problem, but I don't want to require such a new version.
   * This applies only to x86_32, since other architectures that need alignment
- * also have ABIs that ensure aligned stack. */
+ * either have ABIs that ensure aligned stack, or don't support it at all. */
  #if defined(ARCH_X86) && defined(HAVE_MMX)
-int x264_stack_align( void (*func)(x264_t*), x264_t *arg );
-#define x264_stack_align(func,arg) x264_stack_align((void (*)(x264_t*))func,arg)
+int x264_stack_align( void (*func)(), ... );
+#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
  #else
-#define x264_stack_align(func,arg) func(arg)
+#define x264_stack_align(func,...) func(__VA_ARGS__)
  #endif
  
  typedef struct {
diff --git a/common/frame.c b/common/frame.c

index 99052602a07f871aca123b1ebdab4ce7991a894c..001c4fd9b7e20402cb74c0c7a6b316738e50ba64 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -26,7 +26,7 @@
  
  #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
  
-x264_frame_t *x264_frame_new( x264_t *h )
+x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  {
      x264_frame_t *frame;
      int i, j;
@@ -60,9 +60,23 @@ x264_frame_t *x264_frame_new( x264_t *h )
          CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
          frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
      }
+
+    for( i = 0; i < h->param.i_bframe + 2; i++ )
+        for( j = 0; j < h->param.i_bframe + 2; j++ )
+            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+
+    frame->i_poc = -1;
+    frame->i_type = X264_TYPE_AUTO;
+    frame->i_qpplus1 = 0;
+    frame->i_pts = -1;
+    frame->i_frame = -1;
+    frame->i_frame_num = -1;
+    frame->i_lines_completed = -1;
+    frame->b_fdec = b_fdec;
+
      /* all 4 luma planes allocated together, since the cacheline split code
       * requires them to be in-phase wrt cacheline alignment. */
-    if( h->param.analyse.i_subpel_refine )
+    if( h->param.analyse.i_subpel_refine && b_fdec )
      {
          CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
          for( i = 0; i < 4; i++ )
@@ -75,77 +89,68 @@ x264_frame_t *x264_frame_new( x264_t *h )
          frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
      }
  
-    if( h->frames.b_have_lowres )
+    if( b_fdec ) /* fdec frame */
      {
-        frame->i_width_lowres = frame->i_width[0]/2;
-        frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
-        frame->i_lines_lowres = frame->i_lines[0]/2;
-
-        luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
-
-        CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
-        for( i = 0; i < 4; i++ )
-            frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
-
-        for( j = 0; j <= !!h->param.i_bframe; j++ )
-            for( i = 0; i <= h->param.i_bframe; i++ )
-            {
-                CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
-                CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
-            }
-        CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
-        for( j = 0; j <= h->param.i_bframe+1; j++ )
-            for( i = 0; i <= h->param.i_bframe+1; i++ )
-            {
-                CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
-                CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
-            }
-        frame->i_intra_cost = frame->lowres_costs[0][0];
-        memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+        CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
+        CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
+        CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
+        if( h->param.i_bframe )
+        {
+            CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
+            CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
+        }
+        else
+        {
+            frame->mv[1]  = NULL;
+            frame->ref[1] = NULL;
+        }
+        CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
+        CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
+        if( h->param.analyse.i_me_method >= X264_ME_ESA )
+        {
+            CHECKED_MALLOC( frame->buffer[3],
+                            frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
+            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
+        }
      }
-
-    if( h->param.analyse.i_me_method >= X264_ME_ESA )
+    else /* fenc frame */
      {
-        CHECKED_MALLOC( frame->buffer[3],
-                        frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
-        frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
-    }
-
-    frame->i_poc = -1;
-    frame->i_type = X264_TYPE_AUTO;
-    frame->i_qpplus1 = 0;
-    frame->i_pts = -1;
-    frame->i_frame = -1;
-    frame->i_frame_num = -1;
-    frame->i_lines_completed = -1;
+        if( h->frames.b_have_lowres )
+        {
+            frame->i_width_lowres = frame->i_width[0]/2;
+            frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
+            frame->i_lines_lowres = frame->i_lines[0]/2;
  
-    CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
-    CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
-    CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
-    if( h->param.i_bframe )
-    {
-        CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
-        CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
-    }
-    else
-    {
-        frame->mv[1]  = NULL;
-        frame->ref[1] = NULL;
-    }
+            luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
  
-    CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
-    CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
-    for( i = 0; i < h->param.i_bframe + 2; i++ )
-        for( j = 0; j < h->param.i_bframe + 2; j++ )
-            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+            CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
+            for( i = 0; i < 4; i++ )
+                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
  
-    if( h->param.rc.i_aq_mode )
-    {
-        CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
-        CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
-        if( h->frames.b_have_lowres )
-            /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
-            CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+            for( j = 0; j <= !!h->param.i_bframe; j++ )
+                for( i = 0; i <= h->param.i_bframe; i++ )
+                {
+                    CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
+                    CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
+                }
+            CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
+            for( j = 0; j <= h->param.i_bframe+1; j++ )
+                for( i = 0; i <= h->param.i_bframe+1; i++ )
+                {
+                    CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
+                    CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
+                }
+            frame->i_intra_cost = frame->lowres_costs[0][0];
+            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+        }
+        if( h->param.rc.i_aq_mode )
+        {
+            CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+            CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
+            if( h->frames.b_have_lowres )
+                /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
+                CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+        }
      }
  
      if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
@@ -971,19 +976,19 @@ void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
      assert( frame->i_reference_count > 0 );
      frame->i_reference_count--;
      if( frame->i_reference_count == 0 )
-        x264_frame_push( h->frames.unused, frame );
-    assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
+        x264_frame_push( h->frames.unused[frame->b_fdec], frame );
  }
  
-x264_frame_t *x264_frame_pop_unused( x264_t *h )
+x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
  {
      x264_frame_t *frame;
-    if( h->frames.unused[0] )
-        frame = x264_frame_pop( h->frames.unused );
+    if( h->frames.unused[b_fdec][0] )
+        frame = x264_frame_pop( h->frames.unused[b_fdec] );
      else
-        frame = x264_frame_new( h );
+        frame = x264_frame_new( h, b_fdec );
      if( !frame )
          return NULL;
+    frame->b_last_minigop_bframe = 0;
      frame->i_reference_count = 1;
      frame->b_intra_calculated = 0;
      return frame;
@@ -1008,3 +1013,54 @@ void x264_frame_sort( x264_frame_t **list, int b_dts )
          }
      } while( !b_ok );
  }
+
+/* Deletes every frame in a NULL-terminated frame list, then frees the list
+ * array itself. A NULL list is accepted and ignored, so the function can be
+ * called from cleanup paths where the list was never allocated. */
+void x264_frame_delete_list( x264_frame_t **list )
+{
+    int i = 0;
+    if( !list )
+        return;
+    while( list[i] )
+        x264_frame_delete( list[i++] );
+    x264_free( list );
+}
+
+/* Initializes a synchronized frame list able to hold max_size frames.
+ * The pointer array gets max_size+1 zeroed entries, so it always keeps a
+ * NULL terminator after the last frame.
+ * Returns 0 on success and -1 on failure (negative max_size, allocation
+ * failure, or failure to create the mutex / condition variables).
+ * On failure nothing stays allocated or initialized: whatever was already
+ * set up is released again and slist->list is left NULL. */
+int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
+{
+    if( max_size < 0 )
+        return -1;
+    slist->i_max_size = max_size;
+    slist->i_size = 0;
+    /* NOTE(review): CHECKED_MALLOCZERO presumably jumps to `fail` when the allocation fails -- confirm */
+    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
+    if( x264_pthread_mutex_init( &slist->mutex, NULL ) )
+        goto fail_list;
+    if( x264_pthread_cond_init( &slist->cv_fill, NULL ) )
+        goto fail_mutex;
+    if( x264_pthread_cond_init( &slist->cv_empty, NULL ) )
+        goto fail_cv_fill;
+    return 0;
+    /* unwind in reverse order of construction */
+fail_cv_fill:
+    x264_pthread_cond_destroy( &slist->cv_fill );
+fail_mutex:
+    x264_pthread_mutex_destroy( &slist->mutex );
+fail_list:
+    x264_free( slist->list );
+    slist->list = NULL;
+fail:
+    return -1;
+}
+
+/* Tears down a synchronized frame list: destroys its two condition variables
+ * and its mutex, then deletes the frames still stored in it and frees the
+ * pointer array. */
+void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
+{
+    x264_pthread_cond_destroy( &slist->cv_empty );
+    x264_pthread_cond_destroy( &slist->cv_fill );
+    x264_pthread_mutex_destroy( &slist->mutex );
+    x264_frame_delete_list( slist->list );
+}
+
+/* Appends a frame to the end of a synchronized list.
+ * Waits on cv_empty for as long as the list already holds i_max_size frames,
+ * and broadcasts cv_fill once the frame has been stored. */
+void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
+{
+    x264_pthread_mutex_lock( &slist->mutex );
+    /* the fullness test is repeated after every wakeup */
+    while( slist->i_max_size == slist->i_size )
+        x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
+    slist->list[ slist->i_size ] = frame;
+    slist->i_size++;
+    x264_pthread_mutex_unlock( &slist->mutex );
+    /* cv_fill is signalled only after the mutex has been released */
+    x264_pthread_cond_broadcast( &slist->cv_fill );
+}
+
+/* Returns the number of frames currently stored in the list; the counter is
+ * read while holding the list's mutex. */
+int x264_synch_frame_list_get_size( x264_synch_frame_list_t *slist )
+{
+    int i_count;
+    x264_pthread_mutex_lock( &slist->mutex );
+    i_count = slist->i_size;
+    x264_pthread_mutex_unlock( &slist->mutex );
+    return i_count;
+}
diff --git a/common/frame.h b/common/frame.h

index 9ca83f9391a3ce92d5d3fd5ae1be2add43a9d69a..f6faa12b003dd89635dfdcdf1192fcec96b345a1 100644 (file)
--- a/common/frame.h
+++ b/common/frame.h
@@ -40,6 +40,9 @@ typedef struct
      int     i_frame;    /* Presentation frame number */
      int     i_frame_num; /* Coded frame number */
      int     b_kept_as_ref;
+    uint8_t b_fdec;
+    uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
+    uint8_t i_bframes;   /* number of bframes following this nonb in coded order */
      float   f_qp_avg_rc; /* QPs as decided by ratecontrol */
      float   f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
  
@@ -104,6 +107,17 @@ typedef struct
  
  } x264_frame_t;
  
+/* Synchronized frame list: an array of frame pointers plus a fill counter,
+ * accessed under a mutex, with two condition variables for signalling that
+ * the list has grown or shrunk. */
+typedef struct
+{
+   x264_frame_t **list; /* allocated by x264_synch_frame_list_init() with i_max_size+1 zeroed entries */
+   int i_max_size; /* capacity: x264_synch_frame_list_push() waits while i_size == i_max_size */
+   int i_size; /* number of frames currently stored */
+   x264_pthread_mutex_t     mutex; /* held by push and get_size while they touch list / i_size */
+   x264_pthread_cond_t      cv_fill;  /* event signaling that the list became fuller */
+   x264_pthread_cond_t      cv_empty; /* event signaling that the list became emptier */
+} x264_synch_frame_list_t;
+
  typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
  typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
  typedef struct
@@ -118,7 +132,7 @@ typedef struct
      x264_deblock_intra_t deblock_h_chroma_intra;
  } x264_deblock_function_t;
  
-x264_frame_t *x264_frame_new( x264_t *h );
+x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
  void          x264_frame_delete( x264_frame_t *frame );
  
  int           x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
@@ -144,8 +158,15 @@ x264_frame_t *x264_frame_pop( x264_frame_t **list );
  void          x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
  x264_frame_t *x264_frame_shift( x264_frame_t **list );
  void          x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
-x264_frame_t *x264_frame_pop_unused( x264_t *h );
+x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
  void          x264_frame_sort( x264_frame_t **list, int b_dts );
+void          x264_frame_delete_list( x264_frame_t **list );
+
+int           x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int nelem );
+void          x264_synch_frame_list_delete( x264_synch_frame_list_t *slist );
+void          x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame );
+int           x264_synch_frame_list_get_size( x264_synch_frame_list_t *slist );
+
  #define x264_frame_sort_dts(list) x264_frame_sort(list, 1)
  #define x264_frame_sort_pts(list) x264_frame_sort(list, 0)
  
diff --git a/common/macroblock.c b/common/macroblock.c

index 790dde22b82e499da96b377198670c99b9ff52ed..6e866d4c04ed7e52730ef35991d92a66db4fe91a 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -703,7 +703,7 @@ int x264_macroblock_cache_init( x264_t *h )
          for( j=0; j<3; j++ )
          {
              /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
-            CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], h->fdec->i_stride[j] );
+            CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
              h->mb.intra_border_backup[i][j] += 8;
          }
  
diff --git a/common/osdep.h b/common/osdep.h

index 696bbc9d015a602ee81de8abceffdf79f5b3b29e..9d6a1e635bc2942b0d6bc687e954dc900b8a8ebf 100644 (file)
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -137,6 +137,9 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
  #define x264_pthread_cond_destroy    pthread_cond_destroy
  #define x264_pthread_cond_broadcast  pthread_cond_broadcast
  #define x264_pthread_cond_wait       pthread_cond_wait
+#define x264_pthread_attr_t          pthread_attr_t
+#define x264_pthread_attr_init       pthread_attr_init
+#define x264_pthread_attr_destroy    pthread_attr_destroy
  #else
  #define x264_pthread_mutex_t         int
  #define x264_pthread_mutex_init(m,f) 0
@@ -148,6 +151,9 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
  #define x264_pthread_cond_destroy(c)
  #define x264_pthread_cond_broadcast(c)
  #define x264_pthread_cond_wait(c,m)
+#define x264_pthread_attr_t          int
+#define x264_pthread_attr_init(a)    0
+#define x264_pthread_attr_destroy(a)
  #endif
  
  #define WORD_SIZE sizeof(void*)
@@ -216,4 +222,11 @@ static int ALWAYS_INLINE x264_clz( uint32_t x )
  }
  #endif
  
+/* x264_lower_thread_priority(p): on Linux builds with pthreads this calls
+ * nice(p); on every other configuration it expands to nothing.
+ * The result of nice() is stored in an unused local and otherwise ignored. */
+#if defined(SYS_LINUX) && defined(HAVE_PTHREAD)
+#include <unistd.h>
+/* do/while(0) makes the expansion a single statement, so the macro stays
+ * correct when used as the body of an unbraced if/else */
+#define x264_lower_thread_priority(p) do { UNUSED int nice_ret = nice(p); } while( 0 )
+#else
+#define x264_lower_thread_priority(p)
+#endif
+
  #endif /* X264_OSDEP_H */
diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm

index 2df98fd47bf340dd4bcaa4428fd93b63ac32446e..285111a9c91417d011ee83e99a6ab290c82f91fe 100644 (file)
--- a/common/x86/cpu-a.asm
+++ b/common/x86/cpu-a.asm
@@ -96,11 +96,13 @@ cglobal x264_cpu_cpuid, 0,6
  cglobal x264_stack_align
      push ebp
      mov  ebp, esp
-    sub  esp, 4
+    sub  esp, 8
      and  esp, ~15
      mov  ecx, [ebp+8]
      mov  edx, [ebp+12]
      mov  [esp], edx
+    mov  edx, [ebp+16]
+    mov  [esp+4], edx
      call ecx
      leave
      ret
diff --git a/encoder/analyse.h b/encoder/analyse.h

index a2a04a557670caed15bc84220612e72e8428fbe0..05aae40d00546e03a5a6ba15335a507e8173c2ac 100644 (file)
--- a/encoder/analyse.h
+++ b/encoder/analyse.h
@@ -28,4 +28,12 @@ int  x264_macroblock_analyse( x264_t *h );
  void x264_slicetype_decide( x264_t *h );
  int  x264_lowres_context_alloc( x264_t *h );
  
+void x264_slicetype_analyse( x264_t *h, int keyframe );
+
+int  x264_lookahead_init( x264_t *h, int i_slicetype_length );
+int  x264_lookahead_is_empty( x264_t *h );
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
+void x264_lookahead_get_frames( x264_t *h );
+void x264_lookahead_delete( x264_t *h );
+
  #endif
diff --git a/encoder/encoder.c b/encoder/encoder.c

index eb6c435040c8f6465c13b475b065a15d08e2072f..c6b3398049cfd284a7b0e1368347f3f897bd97bd 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -364,7 +364,7 @@ static int x264_validate_parameters( x264_t *h )
          return -1;
      }
  
-    if( h->param.i_threads == 0 )
+    if( h->param.i_threads == X264_THREADS_AUTO )
          h->param.i_threads = x264_cpu_num_processors() * 3/2;
      h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
      if( h->param.i_threads > 1 )
@@ -519,6 +519,14 @@ static int x264_validate_parameters( x264_t *h )
          h->param.rc.b_mb_tree = 0;
      if( h->param.rc.f_qcompress == 1 )
          h->param.rc.b_mb_tree = 0;
+#ifdef HAVE_PTHREAD
+    if( h->param.i_sync_lookahead )
+        h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX );
+    if( h->param.rc.b_stat_read || h->param.i_threads == 1 )
+        h->param.i_sync_lookahead = 0;
+#else
+    h->param.i_sync_lookahead = 0;
+#endif
  
      h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
                                  && h->param.i_bframe
@@ -740,7 +748,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
  {
      x264_t *h;
      char buf[1000], *p;
-    int i;
+    int i, i_slicetype_length;
  
      CHECKED_MALLOCZERO( h, sizeof(x264_t) );
  
@@ -793,8 +801,10 @@ x264_t *x264_encoder_open( x264_param_t *param )
          h->frames.i_delay = h->param.i_bframe;
      if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
          h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
+    i_slicetype_length = h->frames.i_delay;
      h->frames.i_delay += h->param.i_threads - 1;
      h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
+    h->frames.i_delay += h->param.i_sync_lookahead;
  
      h->frames.i_max_ref0 = h->param.i_frame_reference;
      h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
@@ -810,7 +820,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
  
      h->frames.i_last_idr = - h->param.i_keyint_max;
      h->frames.i_input    = 0;
-    h->frames.last_nonb  = NULL;
+
+    CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
+    /* Allocate room for max refs plus a few extra just in case. */
+    CHECKED_MALLOCZERO( h->frames.unused[1], (h->param.i_threads + 20) * sizeof(x264_frame_t *) );
+    CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
+                        + h->param.i_threads + 3) * sizeof(x264_frame_t *) );
  
      h->i_ref0 = 0;
      h->i_ref1 = 0;
@@ -861,14 +876,14 @@ x264_t *x264_encoder_open( x264_param_t *param )
  
      h->thread[0] = h;
      h->i_thread_num = 0;
-    for( i = 1; i < h->param.i_threads; i++ )
+    for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
          CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
  
      for( i = 0; i < h->param.i_threads; i++ )
      {
          if( i > 0 )
              *h->thread[i] = *h;
-        h->thread[i]->fdec = x264_frame_pop_unused( h );
+        h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
          if( !h->thread[i]->fdec )
              goto fail;
          CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
@@ -879,6 +894,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
              goto fail;
      }
  
+    if( x264_lookahead_init( h, i_slicetype_length ) )
+        goto fail;
+
      if( x264_ratecontrol_new( h ) < 0 )
          goto fail;
  
@@ -1181,8 +1199,6 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
  
  static inline int x264_reference_update( x264_t *h )
  {
-    int i;
-
      if( h->fdec->i_frame >= 0 )
          h->i_frame++;
  
@@ -1191,29 +1207,18 @@ static inline int x264_reference_update( x264_t *h )
          if( h->param.i_threads > 1 )
          {
              x264_frame_push_unused( h, h->fdec );
-            h->fdec = x264_frame_pop_unused( h );
+            h->fdec = x264_frame_pop_unused( h, 1 );
              if( !h->fdec )
                  return -1;
          }
          return 0;
      }
  
-    /* move lowres copy of the image to the ref frame */
-    for( i = 0; i < 4; i++)
-    {
-        XCHG( uint8_t*, h->fdec->lowres[i], h->fenc->lowres[i] );
-        XCHG( uint8_t*, h->fdec->buffer_lowres[i], h->fenc->buffer_lowres[i] );
-    }
-
-    /* adaptive B decision needs a pointer, since it can't use the ref lists */
-    if( h->sh.i_type != SLICE_TYPE_B )
-        h->frames.last_nonb = h->fdec;
-
      /* move frame in the buffer */
      x264_frame_push( h->frames.reference, h->fdec );
      if( h->frames.reference[h->frames.i_max_dpb] )
          x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) );
-    h->fdec = x264_frame_pop_unused( h );
+    h->fdec = x264_frame_pop_unused( h, 1 );
      if( !h->fdec )
          return -1;
      return 0;
@@ -1516,6 +1521,8 @@ static void *x264_slices_write( x264_t *h )
  {
      int i_frame_size = 0;
      int i_slice_num = 0;
+    if( h->param.i_sync_lookahead )
+        x264_lower_thread_priority( 10 );
  
  #ifdef HAVE_MMX
      /* Misalign mask has to be set separately for each thread. */
@@ -1619,7 +1626,7 @@ int     x264_encoder_encode( x264_t *h,
      if( pic_in != NULL )
      {
          /* 1: Copy the picture to a frame and move it to a buffer */
-        x264_frame_t *fenc = x264_frame_pop_unused( h );
+        x264_frame_t *fenc = x264_frame_pop_unused( h, 0 );
          if( !fenc )
              return -1;
  
@@ -1632,8 +1639,6 @@ int     x264_encoder_encode( x264_t *h,
  
          fenc->i_frame = h->frames.i_input++;
  
-        x264_frame_push( h->frames.next, fenc );
-
          if( h->frames.b_have_lowres )
              x264_frame_init_lowres( h, fenc );
  
@@ -1645,55 +1650,33 @@ int     x264_encoder_encode( x264_t *h,
          else if( h->param.rc.i_aq_mode )
              x264_adaptive_quant_frame( h, fenc );
  
+        /* 2: Place the frame into the queue for its slice type decision */
+        x264_lookahead_put_frame( h, fenc );
+
          if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
          {
-            /* Nothing yet to encode */
-            /* waiting for filling bframe buffer */
+            /* Nothing yet to encode, waiting for filling of buffers */
              pic_out->i_type = X264_TYPE_AUTO;
              return 0;
          }
      }
-
-    if( h->frames.current[0] == NULL )
+    else
      {
-        int bframes = 0;
-        /* 2: Select frame types */
-        if( h->frames.next[0] == NULL )
-        {
-            if( x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out ) < 0 )
-                return -1;
-            return 0;
-        }
+        /* no more input: tell the lookahead thread to exit */
+        h->lookahead->b_exit_thread = 1;
+        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
+    }
  
-        x264_stack_align( x264_slicetype_decide, h );
+    /* 3: The picture is analyzed in the lookahead */
+    if( !h->frames.current[0] )
+        x264_lookahead_get_frames( h );
  
-        /* 3: move some B-frames and 1 non-B to encode queue */
-        while( IS_X264_TYPE_B( h->frames.next[bframes]->i_type ) )
-            bframes++;
-        x264_frame_push( h->frames.current, x264_frame_shift( &h->frames.next[bframes] ) );
-        /* FIXME: when max B-frames > 3, BREF may no longer be centered after GOP closing */
-        if( h->param.b_bframe_pyramid && bframes > 1 )
-        {
-            x264_frame_t *mid = x264_frame_shift( &h->frames.next[bframes/2] );
-            mid->i_type = X264_TYPE_BREF;
-            x264_frame_push( h->frames.current, mid );
-            bframes--;
-        }
-        while( bframes-- )
-            x264_frame_push( h->frames.current, x264_frame_shift( h->frames.next ) );
-    }
+    if( !h->frames.current[0] && x264_lookahead_is_empty( h ) )
+        return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
  
      /* ------------------- Get frame to be encoded ------------------------- */
      /* 4: get picture to encode */
      h->fenc = x264_frame_shift( h->frames.current );
-    if( h->fenc == NULL )
-    {
-        /* Nothing yet to encode (ex: waiting for I/P with B frames) */
-        /* waiting for filling bframe buffer */
-        pic_out->i_type = X264_TYPE_AUTO;
-        return 0;
-    }
-
      if( h->fenc->param )
      {
          x264_encoder_reconfig( h, h->fenc->param );
@@ -1704,6 +1687,7 @@ int     x264_encoder_encode( x264_t *h,
      if( h->fenc->i_type == X264_TYPE_IDR )
      {
          h->frames.i_last_idr = h->fenc->i_frame;
+        h->i_frame_num = 0;
      }
  
      /* ------------------- Setup frame context ----------------------------- */
@@ -2029,6 +2013,8 @@ void    x264_encoder_close  ( x264_t *h )
                     || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
                     || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
  
+    x264_lookahead_delete( h );
+
      for( i=0; i<h->param.i_threads; i++ )
      {
          // don't strictly have to wait for the other threads, but it's simpler than canceling them
@@ -2248,21 +2234,9 @@ void    x264_encoder_close  ( x264_t *h )
          h = h->thread[ h->i_thread_phase % h->param.i_threads ];
  
      /* frames */
-    for( i = 0; h->frames.current[i]; i++ )
-    {
-        assert( h->frames.current[i]->i_reference_count == 1 );
-        x264_frame_delete( h->frames.current[i] );
-    }
-    for( i = 0; h->frames.next[i]; i++ )
-    {
-        assert( h->frames.next[i]->i_reference_count == 1 );
-        x264_frame_delete( h->frames.next[i] );
-    }
-    for( i = 0; h->frames.unused[i]; i++ )
-    {
-        assert( h->frames.unused[i]->i_reference_count == 0 );
-        x264_frame_delete( h->frames.unused[i] );
-    }
+    x264_frame_delete_list( h->frames.unused[0] );
+    x264_frame_delete_list( h->frames.unused[1] );
+    x264_frame_delete_list( h->frames.current );
  
      h = h->thread[0];
  
@@ -2302,7 +2276,8 @@ int x264_encoder_delayed_frames( x264_t *h )
      h = h->thread[ h->i_thread_phase % h->param.i_threads ];
      for( i=0; h->frames.current[i]; i++ )
          delayed_frames++;
-    for( i=0; h->frames.next[i]; i++ )
-        delayed_frames++;
+    delayed_frames += x264_synch_frame_list_get_size( &h->lookahead->ifbuf );
+    delayed_frames += x264_synch_frame_list_get_size( &h->lookahead->next );
+    delayed_frames += x264_synch_frame_list_get_size( &h->lookahead->ofbuf );
      return delayed_frames;
  }
diff --git a/encoder/lookahead.c b/encoder/lookahead.c

new file mode 100644 (file)

index 0000000..9df0ce3
--- /dev/null
+++ b/encoder/lookahead.c
@@ -0,0 +1,278 @@
+/*****************************************************************************
+ * lookahead.c: Lookahead slicetype decisions for x264
+ *****************************************************************************
+ * Lookahead.c and associated modifications:
+ *     Copyright (C) 2008 Avail Media
+ *
+ * Authors: Michael Kazmier <mkazmier@availmedia.com>
+ *          Alex Giladi <agiladi@availmedia.com>
+ *          Steven Walters <kemuri9@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* LOOKAHEAD (threaded and non-threaded mode)
+ *
+ * Lookahead types:
+ *     [1] Slice type / scene cut;
+ *
+ * In non-threaded mode, we run the existing slicetype decision code as it was.
+ * In threaded mode, we run in a separate thread, that lives between the calls
+ * to x264_encoder_open() and x264_encoder_close(), and performs lookahead for
+ * the number of frames specified in rc_lookahead.  Recommended setting is
+ * # of bframes + # of threads.
+ */
+#include "common/common.h"
+#include "common/cpu.h"
+#include "analyse.h"
+
+/* Move `count` frames from the head of src to the tail of dst, keeping both
+ * lists' i_size fields in step.  If anything was moved, wake waiters on
+ * dst->cv_fill (dst gained frames) and src->cv_empty (src gained room).
+ * This function takes no locks itself. */
+static void x264_lookahead_shift( x264_synch_frame_list_t *dst, x264_synch_frame_list_t *src, int count )
+{
+    int remaining = count;
+
+    if( !count )
+        return;
+
+    for( ; remaining; remaining-- )
+    {
+        /* dst must have a free slot and src must have a frame to give */
+        assert( dst->i_size != dst->i_max_size );
+        assert( src->i_size );
+        dst->list[ dst->i_size++ ] = x264_frame_shift( src->list );
+        src->i_size--;
+    }
+
+    x264_pthread_cond_broadcast( &dst->cv_fill );
+    x264_pthread_cond_broadcast( &src->cv_empty );
+}
+
+/* Make new_nonb the lookahead's current "last non-B" frame.
+ * The previously held frame, if any, is passed to x264_frame_push_unused(),
+ * and new_nonb's reference count is incremented because the lookahead now
+ * keeps a pointer to it. */
+static void x264_lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb )
+{
+    x264_lookahead_t *look = h->lookahead;
+    x264_frame_t *prev_nonb = look->last_nonb;
+
+    /* release the old frame before taking the new one (order preserved) */
+    if( prev_nonb )
+        x264_frame_push_unused( h, prev_nonb );
+
+    look->last_nonb = new_nonb;
+    new_nonb->i_reference_count++;
+}
+
+#ifdef HAVE_PTHREAD
+/* Lookahead-thread side of slicetype decision for one mini-GOP.
+ * Runs x264_slicetype_decide(), records the terminating non-B frame as
+ * last_nonb, then moves the leading B-frames plus that non-B frame from
+ * `next` to `ofbuf`.  Blocks on ofbuf.cv_empty while ofbuf is full.
+ * ofbuf.mutex stays held across the optional keyframe analysis. */
+static void x264_lookahead_slicetype_decide( x264_t *h )
+{
+    x264_lookahead_t *look = h->lookahead;
+    x264_synch_frame_list_t *next = &look->next;
+    x264_synch_frame_list_t *ofbuf = &look->ofbuf;
+    int bframes;
+
+    x264_stack_align( x264_slicetype_decide, h );
+
+    /* count the run of B-frames at the head of `next` */
+    for( bframes = 0; IS_X264_TYPE_B( next->list[bframes]->i_type ); bframes++ )
+        ;
+    x264_lookahead_update_last_nonb( h, next->list[bframes] );
+
+    x264_pthread_mutex_lock( &ofbuf->mutex );
+    while( ofbuf->i_size == ofbuf->i_max_size )
+        x264_pthread_cond_wait( &ofbuf->cv_empty, &ofbuf->mutex );
+
+    /* hand the B-frames and their non-B frame over to the encoder side */
+    x264_pthread_mutex_lock( &next->mutex );
+    x264_lookahead_shift( ofbuf, next, bframes + 1 );
+    x264_pthread_mutex_unlock( &next->mutex );
+
+    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+    if( look->b_analyse_keyframe && IS_X264_TYPE_I( look->last_nonb->i_type ) )
+        x264_stack_align( x264_slicetype_analyse, h, 1 );
+
+    x264_pthread_mutex_unlock( &ofbuf->mutex );
+}
+
+/* Body of the lookahead thread.
+ *
+ * Until b_exit_thread is set, repeatedly:
+ *   - moves as many frames as fit from the input buffer (ifbuf) into `next`;
+ *   - if `next` holds no more than i_slicetype_length frames, sleeps on
+ *     ifbuf.cv_fill until more input arrives (or exit is requested);
+ *   - otherwise runs one slicetype decision, which feeds ofbuf.
+ * On exit it drains ifbuf into `next`, decides all remaining frames, and
+ * finally marks the thread inactive.
+ *
+ * Lock order is always ifbuf.mutex before next.mutex.  b_thread_active is
+ * cleared under ofbuf.mutex with a broadcast on ofbuf.cv_fill, because
+ * consumers wait on that condition with b_thread_active in their predicate. */
+static void x264_lookahead_thread( x264_t *h )
+{
+    int shift;
+#ifdef HAVE_MMX
+    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+        x264_cpu_mask_misalign_sse();
+#endif
+    h->lookahead->b_thread_active = 1;
+    while( !h->lookahead->b_exit_thread )
+    {
+        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+        x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+        /* move only as many frames as `next` has room for */
+        shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size );
+        x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
+        x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+        if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length )
+        {
+            /* not enough frames to decide yet: wait for more input */
+            while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread )
+                x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex );
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+        }
+        else
+        {
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+            x264_lookahead_slicetype_decide( h );
+        }
+    }   /* end of input frames */
+    /* Drain whatever is left, taking the locks in the same order as the loop above. */
+    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+    x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size );
+    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+    while( h->lookahead->next.i_size )
+        x264_lookahead_slicetype_decide( h );
+    /* Publish "inactive" under the mutex and wake anyone blocked on cv_fill,
+     * so a waiter re-evaluates its predicate instead of sleeping forever. */
+    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+    h->lookahead->b_thread_active = 0;
+    x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_fill );
+    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+}
+#endif
+
+int x264_lookahead_init( x264_t *h, int i_slicetype_length )
+{
+    x264_lookahead_t *look;
+    CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) );
+    int i;
+    for( i = 0; i < h->param.i_threads; i++ )
+        h->thread[i]->lookahead = look;
+
+    look->i_last_idr = - h->param.i_keyint_max;
+    look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead))
+                               && !h->param.rc.b_stat_read;
+    look->i_slicetype_length = i_slicetype_length;
+
+    /* init frame lists */
+    if( x264_synch_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) ||
+        x264_synch_frame_list_init( &look->next, h->frames.i_delay+3 ) ||
+        x264_synch_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) )
+        goto fail;
+
+    if( !h->param.i_sync_lookahead )
+        return 0;
+
+    x264_t *look_h = h->thread[h->param.i_threads];
+    *look_h = *h;
+    if( x264_macroblock_cache_init( look_h ) )
+        goto fail;
+
+    UNUSED x264_pthread_attr_t attr;
+    if( x264_pthread_attr_init( &attr ) )
+        goto fail;
+#if defined(USE_REAL_PTHREAD) && !defined(SYS_LINUX)
+    int offset = sched_get_priority_max( SCHED_OTHER );
+    x264_log( h, X264_LOG_DEBUG, "setting priority of lookahead thread to %d\n", offset );
+    struct sched_param sp;
+    pthread_attr_getschedparam( &attr, &sp );
+    sp.sched_priority = offset;
+    pthread_attr_setschedparam( &attr, &sp );
+#endif
+
+    if( x264_pthread_create( &look_h->thread_handle, &attr, (void *)x264_lookahead_thread, look_h ) )
+        goto fail;
+
+    x264_pthread_attr_destroy( &attr );
+
+    return 0;
+fail:
+    x264_free( look );
+    return -1;
+}
+
+/* Shut down the lookahead and free everything it owns.
+ *
+ * With a lookahead thread (i_sync_lookahead != 0) the thread is asked to
+ * exit, joined, and its private context h->thread[i_threads] is torn down.
+ * Then the three frame lists, the held last_nonb frame (if any) and the
+ * lookahead state itself are released. */
+void x264_lookahead_delete( x264_t *h )
+{
+    if( h->param.i_sync_lookahead )
+    {
+        /* The thread tests b_exit_thread while holding ifbuf.mutex before it
+         * sleeps on ifbuf.cv_fill.  Set the flag and signal under the same
+         * mutex so the wakeup cannot slip in between its test and its wait,
+         * which would leave the join below blocked forever. */
+        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+        h->lookahead->b_exit_thread = 1;
+        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
+        x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+        x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL );
+        x264_macroblock_cache_end( h->thread[h->param.i_threads] );
+        x264_free( h->thread[h->param.i_threads] );
+    }
+    x264_synch_frame_list_delete( &h->lookahead->ifbuf );
+    x264_synch_frame_list_delete( &h->lookahead->next );
+    x264_synch_frame_list_delete( &h->lookahead->ofbuf );
+    if( h->lookahead->last_nonb )
+        x264_frame_delete( h->lookahead->last_nonb );
+    x264_free( h->lookahead );
+}
+
+/* Queue an input frame for lookahead.
+ * With a lookahead thread the frame goes to the input buffer (ifbuf);
+ * without one it is pushed straight onto the `next` list. */
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame )
+{
+    x264_synch_frame_list_t *target = h->param.i_sync_lookahead
+                                    ? &h->lookahead->ifbuf
+                                    : &h->lookahead->next;
+
+    x264_synch_frame_list_push( target, frame );
+}
+
+/* Return nonzero when both the output buffer (ofbuf) and the `next` list
+ * hold no frames; the input buffer (ifbuf) is not consulted. */
+int x264_lookahead_is_empty( x264_t *h )
+{
+    if( x264_synch_frame_list_get_size( &h->lookahead->ofbuf ) )
+        return 0;
+    return !x264_synch_frame_list_get_size( &h->lookahead->next );
+}
+
+/* Transfer one mini-GOP from the lookahead output buffer to h->frames.current.
+ * Frames are pushed in this order: the non-B frame that follows the leading
+ * B-frames, then (with b_bframe_pyramid and more than one B-frame) the middle
+ * B-frame retyped as BREF, then the remaining B-frames in list order.
+ * Does nothing if the run of B-frames is not followed by a non-B frame.
+ * Waits on ofbuf.cv_fill using ofbuf.mutex but never locks it itself. */
+static void x264_lookahead_encoder_shift( x264_t *h )
+{
+    x264_lookahead_t *look = h->lookahead;
+    x264_synch_frame_list_t *ofbuf = &look->ofbuf;
+    int bframes = 0;
+
+    /* count the B-frames at the head of ofbuf, stopping at the first non-B */
+    for( ; ofbuf->list[bframes]; bframes++ )
+    {
+        while( look->b_thread_active && !ofbuf->i_size )
+            x264_pthread_cond_wait( &ofbuf->cv_fill, &ofbuf->mutex );
+        if( !IS_X264_TYPE_B( ofbuf->list[bframes]->i_type ) )
+            break;
+    }
+
+    /* ran off the end of the list: no complete mini-GOP available */
+    if( !ofbuf->list[bframes] )
+        return;
+
+    /* the non-B frame goes first */
+    x264_frame_push( h->frames.current, x264_frame_shift( &ofbuf->list[bframes] ) );
+    ofbuf->i_size--;
+
+    if( h->param.b_bframe_pyramid && bframes > 1 )
+    {
+        /* promote the middle B-frame to a reference B and send it next */
+        x264_frame_t *mid = x264_frame_shift( &ofbuf->list[bframes/2] );
+        ofbuf->i_size--;
+        mid->i_type = X264_TYPE_BREF;
+        x264_frame_push( h->frames.current, mid );
+        bframes--;
+    }
+
+    /* then the remaining B-frames, oldest first */
+    for( ; bframes; bframes-- )
+    {
+        x264_frame_push( h->frames.current, x264_frame_shift( ofbuf->list ) );
+        ofbuf->i_size--;
+    }
+
+    x264_pthread_cond_broadcast( &ofbuf->cv_empty );
+}
+
+/* Fetch the next mini-GOP into h->frames.current.
+ *
+ * Threaded mode: wait (under ofbuf.mutex) until the output buffer has frames
+ * or the lookahead thread is no longer active, then shift frames out.
+ * Non-threaded mode: if frames.current is empty and `next` has frames, run
+ * the slicetype decision inline, move the result through ofbuf, and shift. */
+void x264_lookahead_get_frames( x264_t *h )
+{
+    x264_lookahead_t *look = h->lookahead;
+
+    if( h->param.i_sync_lookahead )
+    {   /* We have a lookahead thread, so get frames from there */
+        x264_synch_frame_list_t *ofbuf = &look->ofbuf;
+
+        x264_pthread_mutex_lock( &ofbuf->mutex );
+        while( !ofbuf->i_size && look->b_thread_active )
+            x264_pthread_cond_wait( &ofbuf->cv_fill, &ofbuf->mutex );
+        x264_lookahead_encoder_shift( h );
+        x264_pthread_mutex_unlock( &ofbuf->mutex );
+        return;
+    }
+
+    /* We are not running a lookahead thread, so perform all the slicetype decide on the fly */
+    if( h->frames.current[0] || !look->next.i_size )
+        return;
+
+    x264_stack_align( x264_slicetype_decide, h );
+
+    /* count the leading B-frames chosen by the decision above */
+    int bframes;
+    for( bframes = 0; IS_X264_TYPE_B( look->next.list[bframes]->i_type ); bframes++ )
+        ;
+
+    x264_lookahead_update_last_nonb( h, look->next.list[bframes] );
+    x264_lookahead_shift( &look->ofbuf, &look->next, bframes + 1 );
+
+    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+    if( look->b_analyse_keyframe && IS_X264_TYPE_I( look->last_nonb->i_type ) )
+        x264_stack_align( x264_slicetype_analyse, h, 1 );
+
+    x264_lookahead_encoder_shift( h );
+}
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c

index ca19d64a82460870158794fe1ea0a4f4f1b6323c..cb7fd3b86dc3c0d1d507eaf15255e7db2553265f 100644 (file)
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -922,11 +922,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
      }
  
      if( h->sh.i_type != SLICE_TYPE_B )
-    {
-        rc->bframes = 0;
-        while( h->frames.current[rc->bframes] && IS_X264_TYPE_B(h->frames.current[rc->bframes]->i_type) )
-            rc->bframes++;
-    }
+        rc->bframes = h->fenc->i_bframes;
  
      if( i_force_qp )
      {
@@ -1250,7 +1246,7 @@ int x264_ratecontrol_end( x264_t *h, int bits )
          if( h->sh.i_type == SLICE_TYPE_B )
          {
              rc->bframe_bits += bits;
-            if( !h->frames.current[0] || !IS_X264_TYPE_B(h->frames.current[0]->i_type) )
+            if( h->fenc->b_last_minigop_bframe )
              {
                  update_predictor( rc->pred_b_from_p, qp2qscale(rc->qpa_rc),
                                    h->fref1[h->i_ref1-1]->i_satd, rc->bframe_bits / rc->bframes );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c

index af74427de5655644b76c914960072218c0bcdbb4..88aff91b36c1e87a6f411919475c90dfe92e34d9 100644 (file)
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -625,7 +625,7 @@ static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_fram
  /* Uses strings due to the fact that the speed of the control functions is
     negligable compared to the cost of running slicetype_frame_cost, and because
     it makes debugging easier. */
-static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, int buffer_size, char (*best_paths)[X264_LOOKAHEAD_MAX] )
+static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, char (*best_paths)[X264_LOOKAHEAD_MAX] )
  {
      char paths[X264_BFRAME_MAX+2][X264_LOOKAHEAD_MAX] = {{0}};
      int num_paths = X264_MIN(max_bframes+1, length);
@@ -666,7 +666,7 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
      int icost = frame->i_cost_est[0][0];
      int pcost = frame->i_cost_est[p1-p0][0];
      float f_bias;
-    int i_gop_size = frame->i_frame - h->frames.i_last_idr;
+    int i_gop_size = frame->i_frame - h->lookahead->i_last_idr;
      float f_thresh_max = h->param.i_scenecut_threshold / 100.0;
      /* magic numbers pulled out of thin air */
      float f_thresh_min = f_thresh_max * h->param.i_keyint_min
@@ -700,33 +700,33 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
      return res;
  }
  
-static void x264_slicetype_analyse( x264_t *h, int keyframe )
+void x264_slicetype_analyse( x264_t *h, int keyframe )
  {
      x264_mb_analysis_t a;
      x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, };
-    int num_frames;
-    int keyint_limit;
-    int i,j;
+    int num_frames, keyint_limit, idr_frame_type, i, j;
      int i_mb_count = NUM_MBS;
      int cost1p0, cost2p0, cost1b1, cost2p1;
-    int idr_frame_type;
+    int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX );
+    if( h->param.b_deterministic )
+        i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + !keyframe );
  
      assert( h->frames.b_have_lowres );
  
-    if( !h->frames.last_nonb )
+    if( !h->lookahead->last_nonb )
          return;
-    frames[0] = h->frames.last_nonb;
-    for( j = 0; h->frames.next[j] && h->frames.next[j]->i_type == X264_TYPE_AUTO; j++ )
-        frames[j+1] = h->frames.next[j];
+    frames[0] = h->lookahead->last_nonb;
+    for( j = 0; j < i_max_search && h->lookahead->next.list[j]->i_type == X264_TYPE_AUTO; j++ )
+        frames[j+1] = h->lookahead->next.list[j];
  
      if( !j )
          return;
  
-    keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->frames.i_last_idr - 1;
+    keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->lookahead->i_last_idr - 1;
      num_frames = X264_MIN( j, keyint_limit );
  
      x264_lowres_context_init( h, &a );
-    idr_frame_type = frames[1]->i_frame - h->frames.i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
+    idr_frame_type = frames[1]->i_frame - h->lookahead->i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
  
      /* This is important psy-wise: if we have a non-scenecut keyframe,
       * there will be significant visual artifacts if the frames just before
@@ -765,7 +765,7 @@ static void x264_slicetype_analyse( x264_t *h, int keyframe )
          {
              /* Perform the frametype analysis. */
              for( n = 2; n < num_frames-1; n++ )
-                x264_slicetype_path( h, &a, frames, n, max_bframes, num_frames-max_bframes, best_paths );
+                x264_slicetype_path( h, &a, frames, n, max_bframes, best_paths );
              if( num_frames > 1 )
              {
                  num_bframes = strspn( best_paths[num_frames-2], "B" );
@@ -888,15 +888,15 @@ void x264_slicetype_decide( x264_t *h )
      int bframes;
      int i;
  
-    if( h->frames.next[0] == NULL )
+    if( !h->lookahead->next.i_size )
          return;
  
      if( h->param.rc.b_stat_read )
      {
          /* Use the frame types from the first pass */
-        for( i = 0; h->frames.next[i] != NULL; i++ )
-            h->frames.next[i]->i_type =
-                x264_ratecontrol_slice_type( h, h->frames.next[i]->i_frame );
+        for( i = 0; i < h->lookahead->next.i_size; i++ )
+            h->lookahead->next.list[i]->i_type =
+                x264_ratecontrol_slice_type( h, h->lookahead->next.list[i]->i_frame );
      }
      else if( (h->param.i_bframe && h->param.i_bframe_adaptive)
               || h->param.i_scenecut_threshold
@@ -906,10 +906,10 @@ void x264_slicetype_decide( x264_t *h )
  
      for( bframes = 0;; bframes++ )
      {
-        frm = h->frames.next[bframes];
+        frm = h->lookahead->next.list[bframes];
  
          /* Limit GOP size */
-        if( frm->i_frame - h->frames.i_last_idr >= h->param.i_keyint_max )
+        if( frm->i_frame - h->lookahead->i_last_idr >= h->param.i_keyint_max )
          {
              if( frm->i_type == X264_TYPE_AUTO )
                  frm->i_type = X264_TYPE_IDR;
@@ -919,19 +919,16 @@ void x264_slicetype_decide( x264_t *h )
          if( frm->i_type == X264_TYPE_IDR )
          {
              /* Close GOP */
+            h->lookahead->i_last_idr = frm->i_frame;
              if( bframes > 0 )
              {
                  bframes--;
-                h->frames.next[bframes]->i_type = X264_TYPE_P;
-            }
-            else
-            {
-                h->i_frame_num = 0;
+                h->lookahead->next.list[bframes]->i_type = X264_TYPE_P;
              }
          }
  
-        if( bframes == h->param.i_bframe
-            || h->frames.next[bframes+1] == NULL )
+        if( bframes == h->param.i_bframe ||
+            !h->lookahead->next.list[bframes+1] )
          {
              if( IS_X264_TYPE_B( frm->i_type ) )
                  x264_log( h, X264_LOG_WARNING, "specified frame type is not compatible with max B-frames\n" );
@@ -945,45 +942,47 @@ void x264_slicetype_decide( x264_t *h )
  
          else if( !IS_X264_TYPE_B( frm->i_type ) ) break;
      }
+
+    if( bframes )
+        h->lookahead->next.list[bframes-1]->b_last_minigop_bframe = 1;
+    h->lookahead->next.list[bframes]->i_bframes = bframes;
+
+    /* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */
+    if( h->param.rc.i_rc_method != X264_RC_CQP )
+    {
+        x264_mb_analysis_t a;
+        x264_frame_t *frames[X264_BFRAME_MAX+2] = { NULL, };
+        int p0=0, p1, b;
+
+        x264_lowres_context_init( h, &a );
+
+        if( IS_X264_TYPE_I( h->lookahead->next.list[bframes]->i_type ) )
+            p1 = b = 0;
+        else // P
+            p1 = b = bframes + 1;
+        frames[p0] = h->lookahead->last_nonb;
+        frames[b] = h->lookahead->next.list[bframes];
+
+        x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+    }
  }
  
  int x264_rc_analyse_slice( x264_t *h )
  {
-    x264_mb_analysis_t a;
-    x264_frame_t *frames[X264_LOOKAHEAD_MAX+2] = { NULL, };
+    x264_frame_t *frames[X264_BFRAME_MAX+2] = { NULL, };
      int p0=0, p1, b;
      int cost;
  
-    x264_lowres_context_init( h, &a );
-
      if( IS_X264_TYPE_I(h->fenc->i_type) )
-    {
          p1 = b = 0;
-        /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
-        if( h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead) )
-        {
-            h->frames.last_nonb = h->fenc;
-            x264_slicetype_analyse( h, 1 );
-        }
-    }
-    else if( X264_TYPE_P == h->fenc->i_type )
-    {
-        p1 = 0;
-        while( h->frames.current[p1] && IS_X264_TYPE_B( h->frames.current[p1]->i_type ) )
-            p1++;
-        p1++;
-        b = p1;
-    }
-    else //B
-    {
-        p1 = (h->fref1[0]->i_poc - h->fref0[0]->i_poc)/2;
-        b  = (h->fref1[0]->i_poc - h->fenc->i_poc)/2;
-        frames[p1] = h->fref1[0];
-    }
+    else // P
+        p1 = b = h->fenc->i_bframes + 1;
      frames[p0] = h->fref0[0];
      frames[b] = h->fenc;
  
-    cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+    /* cost should have been already calculated by x264_slicetype_decide */
+    cost = frames[b]->i_cost_est[b-p0][p1-b];
+    assert( cost >= 0 );
  
      if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
          cost = x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b );
diff --git a/x264.c b/x264.c

index 76e53072d231c3862b267fa38cd48ceaf3cabc8a..0ed538a003e75b238d13a9ce8a78174771b34c68 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -355,6 +355,7 @@ static void Help( x264_param_t *defaults, int b_longhelp )
      H0( "      --ssim                  Enable SSIM computation\n" );
      H0( "      --threads <integer>     Force a specific number of threads\n" );
      H1( "      --thread-input          Run Avisynth in its own thread\n" );
+    H1( "      --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
      H1( "      --non-deterministic     Slightly improve quality of SMP, at the cost of repeatability\n" );
      H1( "      --asm <integer>         Override CPU detection\n" );
      H1( "      --no-asm                Disable all CPU optimizations\n" );
@@ -467,6 +468,7 @@ static struct option long_options[] =
      { "slice-max-mbs",     required_argument, NULL, 0 },
      { "slices",            required_argument, NULL, 0 },
      { "thread-input",      no_argument, NULL, OPT_THREAD_INPUT },
+    { "sync-lookahead",    required_argument, NULL, 0 },
      { "non-deterministic", no_argument, NULL, 0 },
      { "psnr",              no_argument, NULL, 0 },
      { "ssim",              no_argument, NULL, 0 },
@@ -988,7 +990,7 @@ generic_option:
  
  #ifdef HAVE_PTHREAD
      if( b_thread_input || param->i_threads > 1
-        || (param->i_threads == 0 && x264_cpu_num_processors() > 1) )
+        || (param->i_threads == X264_THREADS_AUTO && x264_cpu_num_processors() > 1) )
      {
          if( open_file_thread( NULL, &opt->hin, param ) )
          {
diff --git a/x264.h b/x264.h

index 5e6d411788a64bc16da0d7fa11aecd92a0f1d018..66f4f282b151982532c41800f3956685c8de6f99 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
  
  #include <stdarg.h>
  
-#define X264_BUILD 74
+#define X264_BUILD 75
  
  /* x264_t:
   *      opaque handler for encoder */
@@ -139,6 +139,7 @@ static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", ""
  
  /* Threading */
  #define X264_THREADS_AUTO 0 /* Automatically select optimal number of threads */
+#define X264_SYNC_LOOKAHEAD_AUTO -1 /* Automatically select optimal lookahead thread buffer size */
  
  /* Zones: override ratecontrol or other options for specific sections of the video.
   * See x264_encoder_reconfig() for which options can be changed.
@@ -158,6 +159,7 @@ typedef struct x264_param_t
      unsigned int cpu;
      int         i_threads;       /* encode multiple frames in parallel */
      int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
+    int         i_sync_lookahead; /* threaded lookahead buffer */
  
      /* Video Properties */
      int         i_width;
author	Steven Walters <kemuri9@gmail.com>
	Wed, 2 Sep 2009 01:46:51 +0000 (18:46 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Wed, 2 Sep 2009 04:06:20 +0000 (21:06 -0700)
Makefile		patch \| blob \| history
common/common.c		patch \| blob \| history
common/common.h		patch \| blob \| history
common/cpu.h		patch \| blob \| history
common/frame.c		patch \| blob \| history
common/frame.h		patch \| blob \| history
common/macroblock.c		patch \| blob \| history
common/osdep.h		patch \| blob \| history
common/x86/cpu-a.asm		patch \| blob \| history
encoder/analyse.h		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
encoder/lookahead.c	[new file with mode: 0644]	patch \| blob
encoder/ratecontrol.c		patch \| blob \| history
encoder/slicetype.c		patch \| blob \| history
x264.c		patch \| blob \| history
x264.h		patch \| blob \| history