New threading method:

author Loren Merritt <pengvado@videolan.org>

Fri, 15 Dec 2006 23:03:36 +0000 (23:03 +0000)

committer Loren Merritt <pengvado@videolan.org>

Fri, 15 Dec 2006 23:03:36 +0000 (23:03 +0000)
author Loren Merritt <pengvado@videolan.org>
Fri, 15 Dec 2006 23:03:36 +0000 (23:03 +0000)
committer Loren Merritt <pengvado@videolan.org>
Fri, 15 Dec 2006 23:03:36 +0000 (23:03 +0000)
diff --git a/common/amd64/mc-a2.asm b/common/amd64/mc-a2.asm

index 3591d1ea82c4a272650ae48e1b10c1ba3cb8e920..40c9a824170a3092c0a799c91dbd5f37a1c21218 100644 (file)
--- a/common/amd64/mc-a2.asm
+++ b/common/amd64/mc-a2.asm
@@ -158,7 +158,7 @@ x264_hpel_filter_mmxext :
  ALIGN 16
  .vertical_filter:
  
-    prefetchnta [src + stride5 + 32]
+    prefetcht0  [src + stride5 + 32]
  
      LOAD_ADD    mm1,    [src               ], [src + stride5     ] ; a0
      LOAD_ADD    mm2,    [src + stride      ], [src + stride*4    ] ; b0
diff --git a/common/common.c b/common/common.c

index 4bc8fc099fad7294917375e9b2a692085480bbdb..00b1d9fd14a368fa66313a1ebe55abbd0cc6bb7f 100644 (file)
--- a/common/common.c
+++ b/common/common.c
@@ -45,6 +45,7 @@ void    x264_param_default( x264_param_t *param )
      /* CPU autodetect */
      param->cpu = x264_cpu_detect();
      param->i_threads = 1;
+    param->b_deterministic = 1;
  
      /* Video properties */
      param->i_csp           = X264_CSP_I420;
@@ -118,6 +119,7 @@ void    x264_param_default( x264_param_t *param )
      param->analyse.i_me_range = 16;
      param->analyse.i_subpel_refine = 5;
      param->analyse.b_chroma_me = 1;
+    param->analyse.i_mv_range_thread = -1;
      param->analyse.i_mv_range = -1; // set from level_idc
      param->analyse.i_direct_8x8_inference = -1; // set from level_idc
      param->analyse.i_chroma_qp_offset = 0;
@@ -245,6 +247,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
          else
              p->i_threads = atoi(value);
      }
+    OPT2("deterministic", "n-deterministic")
+        p->b_deterministic = atobool(value);
      OPT2("level", "level-idc")
      {
          if( atof(value) < 6 )
@@ -301,6 +305,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
      }
      OPT("scenecut")
          p->i_scenecut_threshold = atoi(value);
+    OPT("pre-scenecut")
+        p->b_pre_scenecut = atobool(value);
      OPT("bframes")
          p->i_bframe = atoi(value);
      OPT("b-adapt")
@@ -431,8 +437,10 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
          b_error |= parse_enum( value, x264_motion_est_names, &p->analyse.i_me_method );
      OPT2("merange", "me-range")
          p->analyse.i_me_range = atoi(value);
-    OPT("mvrange")
+    OPT2("mvrange", "mv-range")
          p->analyse.i_mv_range = atoi(value);
+    OPT2("mvrange-thread", "mv-range-thread")
+        p->analyse.i_mv_range_thread = atoi(value);
      OPT2("subme", "subq")
          p->analyse.i_subpel_refine = atoi(value);
      OPT("bime")
@@ -879,7 +887,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
      s += sprintf( s, " cqm=%d", p->i_cqm_preset );
      s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
      s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
-    s += sprintf( s, " slices=%d", p->i_threads );
+    s += sprintf( s, " threads=%d", p->i_threads );
      s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
      s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
      s += sprintf( s, " mbaff=%d", p->b_interlaced );
@@ -893,8 +901,9 @@ char *x264_param2string( x264_param_t *p, int b_res )
                        p->analyse.b_bidir_me );
      }
  
-    s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d",
-                  p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold );
+    s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d%s",
+                  p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold,
+                  p->b_pre_scenecut ? "(pre)" : "" );
  
      s += sprintf( s, " rc=%s", p->rc.i_rc_method == X264_RC_ABR ?
                                 ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size ? "cbr" : "abr" )
diff --git a/common/common.h b/common/common.h

index c823d3da695a2679db2525ba0c5c08ef6e840ea0..a166a8311b6411be69023bfe1ac135b13ea6ed45 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -52,6 +52,7 @@
  #define pthread_create(t,u,f,d) *(t)=CreateThread(NULL,0,f,d,0,NULL)
  #define pthread_join(t,s)       { WaitForSingleObject(t,INFINITE); \
                                    CloseHandle(t); } 
+#define usleep(t)               Sleep((t+999)/1000);
  #define HAVE_PTHREAD 1
  
  #elif defined(SYS_BEOS)
@@ -61,10 +62,17 @@
                                    resume_thread(*(t)); }
  #define pthread_join(t,s)       { long tmp; \
                                    wait_for_thread(t,(s)?(long*)(s):&tmp); }
+#ifndef usleep
+#define usleep(t)               snooze(t)
+#endif
  #define HAVE_PTHREAD 1
  
  #elif defined(HAVE_PTHREAD)
  #include <pthread.h>
+#else
+#define pthread_t               int
+#define pthread_create(t,u,f,d)
+#define pthread_join(t,s)
  #endif
  
  /****************************************************************************
@@ -79,6 +87,10 @@
  #define XCHG(type,a,b) { type t = a; a = b; b = t; }
  #define FIX8(f) ((int)(f*(1<<8)+.5))
  
+#ifndef offsetof
+#define offsetof(T,F) ((unsigned int)((char *)&((T *)0)->F))
+#endif
+
  #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
  #define UNUSED __attribute__((unused))
  #else
@@ -96,8 +108,10 @@
  }
  
  #define X264_BFRAME_MAX 16
+#define X264_THREAD_MAX 16
  #define X264_SLICE_MAX 4
  #define X264_NAL_MAX (4 + X264_SLICE_MAX)
+#define X264_THREAD_HEIGHT 24 // number of pixels (per thread) in progress at any given time. could theoretically be as low as 22
  
  /****************************************************************************
   * Includes
@@ -272,7 +286,10 @@ struct x264_t
      /* encoder parameters */
      x264_param_t    param;
  
-    x264_t *thread[X264_SLICE_MAX];
+    x264_t          *thread[X264_THREAD_MAX];
+    pthread_t       thread_handle;
+    int             b_thread_active;
+    int             i_thread_phase; /* which thread to use for the next frame */
  
      /* bitstream output */
      struct
@@ -282,6 +299,7 @@ struct x264_t
          int         i_bitstream;    /* size of p_bitstream */
          uint8_t     *p_bitstream;   /* will hold data for all nal */
          bs_t        bs;
+        int         i_frame_size;
      } out;
  
      /* frame number/poc */
@@ -328,7 +346,7 @@ struct x264_t
          /* Temporary buffer (frames types not yet decided) */
          x264_frame_t *next[X264_BFRAME_MAX+3];
          /* Unused frames */
-        x264_frame_t *unused[X264_BFRAME_MAX+3];
+        x264_frame_t *unused[X264_BFRAME_MAX + X264_THREAD_MAX*2 + 16+4];
          /* For adaptive B decision */
          x264_frame_t *last_nonb;
  
@@ -439,6 +457,7 @@ struct x264_t
          int16_t (*mvr[2][32])[2];           /* 16x16 mv for each possible ref */
          int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
          int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
+        uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
  
          /* current value */
          int     i_type;
@@ -550,6 +569,7 @@ struct x264_t
              /* XXX: both omit the cost of MBs coded as P_SKIP */
              int i_intra_cost;
              int i_inter_cost;
+            int i_mbs_analysed;
              /* Adaptive direct mv pred */
              int i_direct_score[2];
          } frame;
diff --git a/common/frame.c b/common/frame.c

index 7720947984d5a71ac7c1ef72d92be76320bc229f..ea280d489b8ea3c2870217eead79e9f61ce30aee 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -109,6 +109,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
      frame->i_pts = -1;
      frame->i_frame = -1;
      frame->i_frame_num = -1;
+    frame->i_lines_completed = -1;
  
      CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
      CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
@@ -172,7 +173,7 @@ void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src
  
  
  
-static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv )
+static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
  {
  #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
      int y;
@@ -184,56 +185,68 @@ static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_
          memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
      }
      /* upper band */
+    if( b_pad_top )
      for( y = 0; y < i_padv; y++ )
          memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
      /* lower band */
+    if( b_pad_bottom )
      for( y = 0; y < i_padv; y++ )
          memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
  #undef PPIXEL
  }
  
-void x264_frame_expand_border( x264_t *h, x264_frame_t *frame )
+void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
  {
      int i;
+    int b_start = !mb_y;
+    if( mb_y & h->sh.b_mbaff )
+        return;
      for( i = 0; i < frame->i_plane; i++ )
      {
          int stride = frame->i_stride[i];
          int width = 16*h->sps->i_mb_width >> !!i;
-        int height = 16*h->sps->i_mb_height >> !!i;
+        int height = (b_end ? 16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
          int padh = PADH >> !!i;
          int padv = PADV >> !!i;
-        if( h->param.b_interlaced )
+        // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
+        uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
+        if( b_end && !b_start )
+            height += 4 >> (!!i + h->sh.b_mbaff);
+        if( h->sh.b_mbaff )
          {
-            plane_expand_border( frame->plane[i], stride*2, width, height>>1, padh, padv );
-            plane_expand_border( frame->plane[i]+stride, stride*2, width, height>>1, padh, padv );
+            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
+            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
          }
          else
          {
-            plane_expand_border( frame->plane[i], stride, width, height, padh, padv );
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
          }
      }
  }
  
-void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame )
+void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
  {
      /* during filtering, 8 extra pixels were filtered on each edge. 
         we want to expand border from the last filtered pixel */
+    int b_start = !mb_y;
      int stride = frame->i_stride[0];
-    int width = 16*h->sps->i_mb_width;
-    int height = 16*h->sps->i_mb_height;
+    int width = 16*h->sps->i_mb_width + 16;
+    int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
      int padh = PADH - 8;
      int padv = PADV - 8;
      int i;
      for( i = 1; i < 4; i++ )
      {
-        if( h->param.b_interlaced )
+        // buffer: 8 luma, to match the hpel filter
+        uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 8;
+        if( h->sh.b_mbaff )
          {
-            plane_expand_border( frame->filtered[i] - 16*stride - 8, stride*2, width+16, (height>>1)+16, padh, padv );
-            plane_expand_border( frame->filtered[i] - 15*stride - 8, stride*2, width+16, (height>>1)+16, padh, padv );
+            plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
+            plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
          }
          else
          {
-            plane_expand_border( frame->filtered[i] - 8*stride - 8, stride, width+16, height+16, padh, padv );
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
          }
      }
  }
@@ -242,7 +255,7 @@ void x264_frame_expand_border_lowres( x264_frame_t *frame )
  {
      int i;
      for( i = 0; i < 4; i++ )
-        plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV );
+        plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV, 1, 1 );
  }
  
  void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
@@ -505,19 +518,19 @@ static inline void deblock_edge( x264_t *h, uint8_t *pix, int i_stride, int bS[4
      }
  }
  
-void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
+void x264_frame_deblock_row( x264_t *h, int mb_y )
  {
      const int s8x8 = 2 * h->mb.i_mb_stride;
      const int s4x4 = 4 * h->mb.i_mb_stride;
-    const int b_interlaced = h->param.b_interlaced;
+    const int b_interlaced = h->sh.b_mbaff;
      const int mvy_limit = 4 >> b_interlaced;
-    int mb_y, mb_x;
+    int mb_x;
  
      int i_stride2[3] = { h->fdec->i_stride[0] << b_interlaced,
                           h->fdec->i_stride[1] << b_interlaced,
                           h->fdec->i_stride[2] << b_interlaced };
  
-    for( mb_y = 0, mb_x = 0; mb_y < h->sps->i_mb_height; )
+    for( mb_x = 0; mb_x < h->sps->i_mb_width; )
      {
          const int mb_xy  = mb_y * h->mb.i_mb_stride + mb_x;
          const int mb_8x8 = 2 * s8x8 * mb_y + 2 * mb_x;
@@ -610,7 +623,7 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
  
                              bS[i] = 0;
  
-                            for( l = 0; l < 1 + (i_slice_type == SLICE_TYPE_B); l++ )
+                            for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )
                              {
                                  if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||
                                      abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||
@@ -673,16 +686,17 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
          /* next mb */
          if( !b_interlaced || (mb_y&1) )
              mb_x++;
-        if( mb_x >= h->sps->i_mb_width )
-        {
-            mb_x = 0;
-            mb_y++;
-        }
-        else
-            mb_y ^= b_interlaced;
+        mb_y ^= b_interlaced;
      }
  }
  
+void x264_frame_deblock( x264_t *h )
+{
+    int mb_y;
+    for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y += 1 + h->sh.b_mbaff )
+        x264_frame_deblock_row( h, mb_y );
+}
+
  #ifdef HAVE_MMXEXT
  void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
  void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
diff --git a/common/frame.h b/common/frame.h

index ce802b8bd85fb02291186f09babd901dba11d7b0..09fece187b765d9b75c727c6739fa21f19d6cca3 100644 (file)
--- a/common/frame.h
+++ b/common/frame.h
@@ -70,6 +70,10 @@ typedef struct
      int     *i_row_bits;
      int     *i_row_qp;
  
+    /* threading */
+    int     i_lines_completed; /* in pixels */
+    int     i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
+
  } x264_frame_t;
  
  typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
@@ -91,14 +95,15 @@ void          x264_frame_delete( x264_frame_t *frame );
  
  void          x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
  
-void          x264_frame_expand_border( x264_t *h, x264_frame_t *frame );
-void          x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame );
+void          x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
+void          x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end );
  void          x264_frame_expand_border_lowres( x264_frame_t *frame );
  void          x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame );
  
-void          x264_frame_deblocking_filter( x264_t *h, int i_slice_type );
+void          x264_frame_deblock( x264_t *h );
+void          x264_frame_deblock_row( x264_t *h, int mb_y );
  
-void          x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced );
+void          x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced, int mb_y, int b_end );
  void          x264_frame_init_lowres( int cpu, x264_frame_t *frame );
  
  void          x264_deblock_init( int cpu, x264_deblock_function_t *pf );
diff --git a/common/i386/mc-a2.asm b/common/i386/mc-a2.asm

index 240468defa31e37ce85d32a6829b366c1208d44f..b9eb7e50d82a3ca0c98d7c9b66ec846f3c526341 100644 (file)
--- a/common/i386/mc-a2.asm
+++ b/common/i386/mc-a2.asm
@@ -157,7 +157,7 @@ x264_hpel_filter_mmxext :
  ALIGN 16
  .vertical_filter:
  
-    prefetchnta [src3 + stride*2 + 32]
+    prefetcht0  [src3 + stride*2 + 32]
  
      LOAD_ADD    mm1,    [src               ], [src3 + stride*2    ] ; a0
      LOAD_ADD    mm2,    [src + stride      ], [src3 + stride      ] ; b0
diff --git a/common/macroblock.c b/common/macroblock.c

index f9744e8b3660cf686cce251f9e3679da67a8594f..5f3089ec2444fde5f4612bdba1c2a46c67bf9a09 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -308,6 +308,27 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
          }
      }
  
+    if( h->param.i_threads > 1 )
+    {
+        int di = b8x8 ? 4 : 1;
+        for( i4=0; i4<16; i4+=di )
+        {
+            if( h->mb.cache.mv[0][x264_scan8[i4]][1] > h->mb.mv_max_spel[1]
+             || h->mb.cache.mv[1][x264_scan8[i4]][1] > h->mb.mv_max_spel[1] )
+            {
+#if 0
+                fprintf(stderr, "direct_temporal: (%d,%d) (%d,%d) > %d \n",
+                        h->mb.cache.mv[0][x264_scan8[i4]][0],
+                        h->mb.cache.mv[0][x264_scan8[i4]][1],
+                        h->mb.cache.mv[1][x264_scan8[i4]][0],
+                        h->mb.cache.mv[1][x264_scan8[i4]][1],
+                        h->mb.mv_max_spel[1]);
+#endif
+                return 0;
+            }
+        }
+    }
+
      return 1;
  }
  
@@ -368,6 +389,19 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
  
      if( IS_INTRA( type_col ) )
          return 1;
+
+    if( h->param.i_threads > 1
+        && ( mv[0][1] > h->mb.mv_max_spel[1]
+          || mv[1][1] > h->mb.mv_max_spel[1] ) )
+    {
+#if 0
+        fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n",
+                mv[0][0], mv[0][1], mv[1][0], mv[1][1],
+                h->mb.mv_max_spel[1]);
+#endif
+        return 0;
+    }
+
      b8x8 = h->sps->b_direct8x8_inference ||
             (type_col != P_8x8 && type_col != B_SKIP && type_col != B_DIRECT && type_col != B_8x8);
  
@@ -861,6 +895,13 @@ int x264_macroblock_cache_init( x264_t *h )
              CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
      }
  
+    for( i=0; i<=h->param.b_interlaced; i++ )
+        for( j=0; j<3; j++ )
+        {
+            CHECKED_MALLOC( h->mb.intra_border_backup[i][j], h->fdec->i_stride[j] );
+            h->mb.intra_border_backup[i][j] += 8;
+        }
+
      /* init with not available (for top right idx=7,15) */
      memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
      memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
@@ -871,6 +912,9 @@ fail: return -1;
  void x264_macroblock_cache_end( x264_t *h )
  {
      int i, j;
+    for( i=0; i<=h->param.b_interlaced; i++ )
+        for( j=0; j<3; j++ )
+            x264_free( h->mb.intra_border_backup[i][j] - 8 );
      for( i=0; i<2; i++ )
      {
          int i_refs = i ? 1 + h->param.b_bframe_pyramid : h->param.i_frame_reference;
@@ -1117,6 +1161,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
                                 : w * (i_mb_x + i_mb_y * i_stride);
          int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
          const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+        const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
          x264_frame_t **fref[2] = { h->fref0, h->fref1 };
          int j, k, l;
  
@@ -1127,7 +1172,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
  
          h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
              &h->fenc->plane[i][i_pix_offset], i_stride2, w );
-        memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], &plane_fdec[-1-i_stride2], w*3/2+1 );
+        memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
          for( j = 0; j < w; j++ )
              h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
  
diff --git a/common/mc.c b/common/mc.c

index 7721b63a34461291259472c4b31bf3fd0d2009a6..6944281ce9b2282a3174520c5e75e12cbaed86f4 100644 (file)
--- a/common/mc.c
+++ b/common/mc.c
@@ -390,46 +390,48 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
  extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                                       int i_stride, int i_width, int i_height );
  
-void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced )
+void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced, int mb_y, int b_end )
  {
      const int x_inc = 16, y_inc = 16;
      const int stride = frame->i_stride[0] << b_interlaced;
-    const int height = frame->i_lines[0] >> b_interlaced;
+    const int start = (mb_y*16 >> b_interlaced) - 8;
+    const int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
      int x, y;
  
-    pf_mc_t int_h = mc_hh;
-    pf_mc_t int_v = mc_hv;
-    pf_mc_t int_hv = mc_hc;
+    if( mb_y & b_interlaced )
+        return;
+    mb_y >>= b_interlaced;
  
  #ifdef HAVE_MMXEXT
      if ( cpu & X264_CPU_MMXEXT )
      {
-        int offs = -8*stride - 8;
+        // buffer = 4 for deblock + 3 for 6tap, rounded to 8
+        int offs = start*stride - 8;
          x264_hpel_filter_mmxext(
              frame->filtered[1] + offs,
              frame->filtered[2] + offs,
              frame->filtered[3] + offs,
              frame->plane[0] + offs,
-            stride, stride - 48, height + 16);
+            stride, stride - 48, height - start );
      }
      else
  #endif
      {
-        for( y = -8; y < height + 8; y += y_inc )
+        for( y = start; y < height; y += y_inc )
          {
              uint8_t *p_in = frame->plane[0] + y * stride - 8;
              uint8_t *p_h  = frame->filtered[1] + y * stride - 8;
              uint8_t *p_v  = frame->filtered[2] + y * stride - 8;
-            uint8_t *p_hv = frame->filtered[3] + y * stride - 8;
+            uint8_t *p_c  = frame->filtered[3] + y * stride - 8;
              for( x = -8; x < stride - 64 + 8; x += x_inc )
              {
-                int_h(  p_in, stride, p_h,  stride, x_inc, y_inc );
-                int_v(  p_in, stride, p_v,  stride, x_inc, y_inc );
-                int_hv( p_in, stride, p_hv, stride, x_inc, y_inc );
+                mc_hh( p_in, stride, p_h, stride, x_inc, y_inc );
+                mc_hv( p_in, stride, p_v, stride, x_inc, y_inc );
+                mc_hc( p_in, stride, p_c, stride, x_inc, y_inc );
  
                  p_h += x_inc;
                  p_v += x_inc;
-                p_hv += x_inc;
+                p_c += x_inc;
                  p_in += x_inc;
              }
          }
@@ -440,8 +442,9 @@ void x264_frame_filter( int cpu, x264_frame_t *frame, int b_interlaced )
       * the sum of an 8x8 pixel region with top-left corner on that point.
       * in the lower plane, 4x4 sums (needed only with --analyse p4x4). */
  
-    if( frame->integral )
+    if( frame->integral && b_end )
      {
+        //FIXME slice
          memset( frame->integral - 32 * stride - 32, 0, stride * sizeof(uint16_t) );
          for( y = -32; y < frame->i_lines[0] + 31; y++ )
          {
diff --git a/doc/threads.txt b/doc/threads.txt

new file mode 100644 (file)

index 0000000..3777b51
--- /dev/null
+++ b/doc/threads.txt
@@ -0,0 +1,70 @@
+Old threading method: slice-based
+application calls x264
+x264 runs B-adapt and ratecontrol (serial)
+split frame into several slices, and spawn a thread for each slice
+wait until all threads are done
+deblock and hpel filter (serial)
+return to application
+In x264cli, there is one additional thread to decode the input.
+
+New threading method: frame-based
+application calls x264
+x264 runs B-adapt and ratecontrol (serial to the application, but parallel to the other x264 threads)
+spawn a thread for this frame
+thread runs encode in 1 slice, deblock, hpel filter
+meanwhile x264 waits for the oldest thread to finish
+return to application, but the rest of the threads continue running in the background
+No additional threads are needed to decode the input, unless decoding+B-adapt is slower than slice+deblock+hpel, in which case an additional input thread would allow decoding in parallel to B-adapt.
+
+
+Penalties for slice-based threading:
+Each slice adds some bitrate (or equivalently reduces quality), for a variety of reasons: the slice header costs some bits, CABAC contexts are reset, and MVs and intra samples can't be predicted across the slice boundary.
+In CBR mode, we have to allocate bits between slices before encoding them, which may lead to uneven quality.
+Some parts of the encoder are serial, so it doesn't scale well with lots of cpus.
+
+Penalties for frame-based threading:
+To allow encoding of multiple frames in parallel, we have to ensure that any given macroblock uses motion vectors only from pieces of the reference frames that have been encoded already. This is usually not noticeable, but can matter for very fast upward motion.
+We have to commit to one frame type before starting on the frame. Thus scenecut detection must run during the lowres pre-motion-estimation along with B-adapt, which makes it faster but less accurate than re-encoding the whole frame.
+Ratecontrol gets delayed feedback, since it has to plan frame N before frame N-1 finishes.
+
+
+Benchmarks:
+cpu: 4x woodcrest 3GHz
+content: 480p
+
+x264 -B1000 -b2 -m1 -Anone
+threads  speed           psnr
+       old   new      old    new
+1:   1.000x 1.000x   0.000  0.000
+2:   1.168x 1.413x  -0.038 -0.007
+3:   1.208x 1.814x  -0.064 -0.005
+4:   1.293x 2.329x  -0.095 -0.006
+5:          2.526x         -0.007
+6:          2.658x         -0.001
+7:          2.723x         -0.018
+8:          2.712x         -0.019
+
+x264 -B1000 -b2 -m5
+threads  speed           psnr   
+       old   new      old    new
+1:   1.000x 1.000x   0.000  0.000
+2:   1.319x 1.517x  -0.036 -0.006
+3:   1.466x 2.013x  -0.068 -0.005
+4:   1.578x 2.741x  -0.101 -0.004
+5:          3.022x         -0.015
+6:          3.221x         -0.014
+7:          3.331x         -0.020
+8:          3.425x         -0.025
+
+x264 -B1000 -b2 -m6 -r3 -8 --b-rdo
+threads  speed           psnr   
+       old   new      old    new
+1:   1.000x 1.000x   0.000  0.000
+2:   1.531x 1.707x  -0.032 -0.006
+3:   1.866x 2.277x  -0.061 -0.005
+4:   2.097x 3.204x  -0.088 -0.006
+5:          3.468x         -0.013
+6:          3.629x         -0.010
+7:          3.716x         -0.014
+8:          3.745x         -0.018
+
diff --git a/encoder/analyse.c b/encoder/analyse.c

index b3ded41a268164f321edaf03716f1053daffd468..e854da8c1f900f64392722f44f82dfe283a62ad7 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -26,6 +26,7 @@
  #include <string.h>
  #include <math.h>
  #include <limits.h>
+#include <unistd.h>
  
  #include "common/common.h"
  #include "macroblock.h"
@@ -219,27 +220,54 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
      /* II: Inter part P/B frame */
      if( h->sh.i_type != SLICE_TYPE_I )
      {
-        int i;
-        int i_fmv_range = h->param.analyse.i_mv_range - 16;
+        int i, j;
+        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
+        int i_fpel_border = 5; // 3 for hex search, 2 for subpel, ignores subme7 & bime
  
          /* Calculate max allowed MV range */
  #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
          h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
          h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
-        h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
-        h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 8 );
-        h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
-        h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
+        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
+        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
+        h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
+        h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
          if( h->mb.i_mb_x == 0)
          {
              int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
              int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
+            int thread_mvy_range = i_fmv_range;
+
+            if( h->param.i_threads > 1 )
+            {
+                int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
+                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
+                for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
+                {
+                    x264_frame_t **fref = i ? h->fref1 : h->fref0;
+                    int i_ref = i ? h->i_ref1 : h->i_ref0;
+                    for( j=0; j<i_ref; j++ )
+                    {
+                        // could use a condition variable or the like, but
+                        // this way is faster at least on LinuxThreads.
+                        while( fref[j]->i_lines_completed < thresh )
+                            usleep(100);
+                        thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
+                    }
+                }
+                if( h->param.b_deterministic )
+                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
+                if( h->mb.b_interlaced )
+                    thread_mvy_range >>= 1;
+            }
+
              h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
              h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
-            h->mb.mv_min_fpel[1] = CLIP_FMV( -16*mb_y - 8 );
-            h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( mb_height - mb_y - 1 ) + 8 );
-            h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
-            h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
+            h->mb.mv_min_spel[1] = CLIP_FMV( h->mb.mv_min[1] );
+            h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
+            h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
+            h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
+            h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
          }
  #undef CLIP_FMV
  
@@ -943,6 +971,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
          {
              h->mb.i_type = P_SKIP;
              x264_analyse_update_cache( h, a );
+            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
              return;
          }
  
@@ -960,6 +989,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
      }
  
      x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
+    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
  
      h->mb.i_type = P_L0;
      if( a->b_mbrd && a->l0.i_ref == 0
@@ -2043,7 +2073,10 @@ void x264_macroblock_analyse( x264_t *h )
          analysis.b_try_pskip = 0;
          if( h->param.analyse.b_fast_pskip )
          {
-            if( h->param.analyse.i_subpel_refine >= 3 )
+            if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
+                // FIXME don't need to check this if the reference frame is done
+                {}
+            else if( h->param.analyse.i_subpel_refine >= 3 )
                  analysis.b_try_pskip = 1;
              else if( h->mb.i_mb_type_left == P_SKIP ||
                       h->mb.i_mb_type_top == P_SKIP ||
@@ -2058,6 +2091,7 @@ void x264_macroblock_analyse( x264_t *h )
          {
              h->mb.i_type = P_SKIP;
              h->mb.i_partition = D_16x16;
+            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
          }
          else
          {
@@ -2244,6 +2278,7 @@ void x264_macroblock_analyse( x264_t *h )
              h->mb.i_type = i_type;
              h->stat.frame.i_intra_cost += i_intra_cost;
              h->stat.frame.i_inter_cost += i_cost;
+            h->stat.frame.i_mbs_analysed++;
  
              if( h->mb.i_subpel_refine >= 7 )
              {
@@ -2658,6 +2693,32 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
                  break;
              }
      }
+
+#ifndef NDEBUG
+    if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
+    {
+        int l;
+        for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
+        {
+            int completed;
+            int ref = h->mb.cache.ref[l][x264_scan8[0]];
+            if( ref < 0 )
+                continue;
+            completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
+            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
+            {
+                fprintf(stderr, "mb type: %d \n", h->mb.i_type);
+                fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
+                                h->mb.cache.mv[l][x264_scan8[15]][0],
+                                h->mb.cache.mv[l][x264_scan8[15]][1] );
+                fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
+                fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
+                fprintf(stderr, "completed: %d \n", completed );
+                assert(0);
+            }
+        }
+    }
+#endif
  }
  
  #include "slicetype.c"
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 9fae325bd024c3d5d471997b0ac0040b3a27ea20..84aa9325f7c8d273491d313f7daba6a6e87011d0 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -61,6 +61,16 @@ static int64_t i_mtime_filter = 0;
  
  #define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
  
+static x264_frame_t *x264_frame_get( x264_frame_t **list ); //FIXME move
+static void x264_frame_put( x264_frame_t **list, x264_frame_t *frame );
+static void x264_frame_push( x264_frame_t **list, x264_frame_t *frame );
+static void x264_frame_put_unused( x264_t *h, x264_frame_t *frame );
+static x264_frame_t *x264_frame_get_unused( x264_t *h );
+
+static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
+                                    x264_nal_t **pp_nal, int *pi_nal,
+                                    x264_picture_t *pic_out );
+
  /****************************************************************************
   *
   ******************************* x264 libs **********************************
@@ -101,7 +111,7 @@ static void x264_frame_dump( x264_t *h, x264_frame_t *fr, char *name )
  /* Fill "default" values */
  static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
                                      x264_sps_t *sps, x264_pps_t *pps,
-                                    int i_type, int i_idr_pic_id, int i_frame, int i_qp )
+                                    int i_idr_pic_id, int i_frame, int i_qp )
  {
      x264_param_t *param = &h->param;
      int i;
@@ -110,7 +120,6 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
      sh->sps = sps;
      sh->pps = pps;
  
-    sh->i_type      = i_type;
      sh->i_first_mb  = 0;
      sh->i_last_mb   = h->sps->i_mb_width * h->sps->i_mb_height;
      sh->i_pps_id    = pps->i_id;
@@ -347,16 +356,24 @@ static int x264_validate_parameters( x264_t *h )
      }
  
      if( h->param.i_threads == 0 )
-        h->param.i_threads = x264_cpu_num_processors();
-    h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_SLICE_MAX );
-    h->param.i_threads = X264_MIN( h->param.i_threads, (h->param.i_height + 15) >> (4 + h->param.b_interlaced) );
-#ifndef HAVE_PTHREAD
+        h->param.i_threads = x264_cpu_num_processors() * 3/2;
+    h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
+    h->param.i_threads = X264_MIN( h->param.i_threads, 1 + (h->param.i_height >> h->param.b_interlaced) / (X264_THREAD_HEIGHT + 16) ); // FIXME exact limit?
      if( h->param.i_threads > 1 )
      {
+#ifndef HAVE_PTHREAD
          x264_log( h, X264_LOG_WARNING, "not compiled with pthread support!\n");
-        x264_log( h, X264_LOG_WARNING, "multislicing anyway, but you won't see any speed gain.\n" );
-    }
+        h->param.i_threads = 1;
+#else
+        if( h->param.analyse.i_me_method == X264_ME_ESA )
+        {
+            x264_log( h, X264_LOG_WARNING, "threads are not yet compatible with ESA\n");
+            h->param.analyse.i_me_method = X264_ME_UMH;
+        }
+        if( h->param.i_scenecut_threshold >= 0 )
+            h->param.b_pre_scenecut = 1;
  #endif
+    }
  
      if( h->param.b_interlaced )
      {
@@ -476,6 +493,29 @@ static int x264_validate_parameters( x264_t *h )
              h->param.analyse.i_direct_8x8_inference = l->direct8x8;
      }
  
+    if( h->param.i_threads > 1 )
+    {
+        int r = h->param.analyse.i_mv_range_thread;
+        int r2;
+        if( r <= 0 )
+        {
+            // half of the available space is reserved and divided evenly among the threads,
+            // the rest is allocated to whichever thread is far enough ahead to use it.
+            // reserving more space increases quality for some videos, but costs more time
+            // in thread synchronization.
+            int max_range = (h->param.i_height + X264_THREAD_HEIGHT) / h->param.i_threads - X264_THREAD_HEIGHT;
+            r = max_range / 2;
+        }
+        r = X264_MAX( r, h->param.analyse.i_me_range );
+        r = X264_MIN( r, h->param.analyse.i_mv_range );
+        // round up to use the whole mb row
+        r2 = (r & ~15) + ((-X264_THREAD_HEIGHT) & 15);
+        if( r2 < r )
+            r2 += 16;
+        x264_log( h, X264_LOG_DEBUG, "using mv_range_thread = %d\n", r2 );
+        h->param.analyse.i_mv_range_thread = r2;
+    }
+
      if( h->param.rc.f_qblur < 0 )
          h->param.rc.f_qblur = 0;
      if( h->param.rc.f_complexity_blur < 0 )
@@ -577,12 +617,6 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
      x264_reduce_fraction( &h->param.i_fps_num, &h->param.i_fps_den );
  
      /* Init x264_t */
-    h->out.i_nal = 0;
-    h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 1.7
-        * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.5, h->param.rc.i_qp_min )
-          : pow( 0.5, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
-    h->out.p_bitstream = x264_malloc( h->out.i_bitstream );
-
      h->i_frame = 0;
      h->i_frame_num = 0;
      h->i_idr_pic_id = 0;
@@ -600,7 +634,7 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
      h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
  
      /* Init frames. */
-    h->frames.i_delay = h->param.i_bframe;
+    h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
      h->frames.i_max_ref0 = h->param.i_frame_reference;
      h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
      h->frames.i_max_dpb  = h->sps->vui.i_max_dec_frame_buffering + 1;
@@ -609,25 +643,6 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
            || h->param.rc.i_rc_method == X264_RC_CRF
            || h->param.b_bframe_adaptive );
  
-    for( i = 0; i < X264_BFRAME_MAX + 3; i++ )
-    {
-        h->frames.current[i] = NULL;
-        h->frames.next[i]    = NULL;
-        h->frames.unused[i]  = NULL;
-    }
-    for( i = 0; i < 1 + h->frames.i_delay; i++ )
-    {
-        h->frames.unused[i] =  x264_frame_new( h );
-        if( !h->frames.unused[i] )
-            return NULL;
-    }
-    for( i = 0; i < h->frames.i_max_dpb; i++ )
-    {
-        h->frames.reference[i] = x264_frame_new( h );
-        if( !h->frames.reference[i] )
-            return NULL;
-    }
-    h->frames.reference[h->frames.i_max_dpb] = NULL;
      h->frames.i_last_idr = - h->param.i_keyint_max;
      h->frames.i_input    = 0;
      h->frames.last_nonb  = NULL;
@@ -635,10 +650,6 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
      h->i_ref0 = 0;
      h->i_ref1 = 0;
  
-    h->fdec = h->frames.reference[0];
-
-    if( x264_macroblock_cache_init( h ) < 0 )
-        return NULL;
      x264_rdo_init( );
  
      /* init CPU functions */
@@ -658,10 +669,6 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
  
      mbcmp_init( h );
  
-    /* rate control */
-    if( x264_ratecontrol_new( h ) < 0 )
-        return NULL;
-
      x264_log( h, X264_LOG_INFO, "using cpu capabilities %s%s%s%s%s%s\n",
               param->cpu&X264_CPU_MMX ? "MMX " : "",
               param->cpu&X264_CPU_MMXEXT ? "MMXEXT " : "",
@@ -670,11 +677,29 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
               param->cpu&X264_CPU_3DNOW ? "3DNow! " : "",
               param->cpu&X264_CPU_ALTIVEC ? "Altivec " : "" );
  
+    h->out.i_nal = 0;
+    h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 1.7
+        * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.5, h->param.rc.i_qp_min )
+          : pow( 0.5, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
+
      h->thread[0] = h;
      h->i_thread_num = 0;
      for( i = 1; i < h->param.i_threads; i++ )
          h->thread[i] = x264_malloc( sizeof(x264_t) );
  
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        if( i > 0 )
+            *h->thread[i] = *h;
+        h->thread[i]->fdec = x264_frame_get_unused( h );
+        h->thread[i]->out.p_bitstream = x264_malloc( h->out.i_bitstream );
+        if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
+            return NULL;
+    }
+
+    if( x264_ratecontrol_new( h ) < 0 )
+        return NULL;
+
  #ifdef DEBUG_DUMP_FRAME
      {
          /* create or truncate the reconstructed video file */
@@ -788,14 +813,14 @@ int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
  }
  
  
-static void x264_frame_put( x264_frame_t *list[X264_BFRAME_MAX], x264_frame_t *frame )
+static void x264_frame_put( x264_frame_t **list, x264_frame_t *frame )
  {
      int i = 0;
      while( list[i] ) i++;
      list[i] = frame;
  }
  
-static void x264_frame_push( x264_frame_t *list[X264_BFRAME_MAX], x264_frame_t *frame )
+static void x264_frame_push( x264_frame_t **list, x264_frame_t *frame )
  {
      int i = 0;
      while( list[i] ) i++;
@@ -804,16 +829,38 @@ static void x264_frame_push( x264_frame_t *list[X264_BFRAME_MAX], x264_frame_t *
      list[0] = frame;
  }
  
-static x264_frame_t *x264_frame_get( x264_frame_t *list[X264_BFRAME_MAX+1] )
+static x264_frame_t *x264_frame_get( x264_frame_t **list )
  {
      x264_frame_t *frame = list[0];
      int i;
      for( i = 0; list[i]; i++ )
          list[i] = list[i+1];
+    assert(frame);
      return frame;
  }
  
-static void x264_frame_sort( x264_frame_t *list[X264_BFRAME_MAX+1], int b_dts )
+/* Drop one reference to `frame`.
+ * When the last reference goes away the frame is stored in the first free
+ * slot of h->frames.unused, from where x264_frame_get_unused() can hand it
+ * out again. */
+static void x264_frame_put_unused( x264_t *h, x264_frame_t *frame )
+{
+    assert( frame->i_reference_count > 0 );
+
+    if( --frame->i_reference_count == 0 )
+        x264_frame_put( h->frames.unused, frame );
+
+    /* the list is NULL-terminated, so its final slot has to be empty even
+     * after the insertion above; otherwise the list has overflowed */
+    assert( h->frames.unused[ sizeof(h->frames.unused)
+                              / sizeof(*h->frames.unused) - 1 ] == NULL );
+}
+
+/* Obtain a frame for the caller.
+ * A frame is taken from h->frames.unused when that list is not empty,
+ * otherwise a fresh one is created with x264_frame_new().
+ * The returned frame has i_reference_count == 1; give it back with
+ * x264_frame_put_unused().
+ * Returns NULL if a new frame was needed and x264_frame_new() failed. */
+static x264_frame_t *x264_frame_get_unused( x264_t *h )
+{
+    x264_frame_t *frame;
+    if( h->frames.unused[0] )
+        frame = x264_frame_get( h->frames.unused );
+    else
+    {
+        frame = x264_frame_new( h );
+        /* allocation can fail: report it instead of dereferencing NULL
+         * in the reference count handling below */
+        if( !frame )
+            return NULL;
+    }
+    assert( frame->i_reference_count == 0 );
+    frame->i_reference_count = 1;
+    return frame;
+}
+
+static void x264_frame_sort( x264_frame_t **list, int b_dts )
  {
      int i, b_ok;
      do {
@@ -835,7 +882,7 @@ static void x264_frame_sort( x264_frame_t *list[X264_BFRAME_MAX+1], int b_dts )
  #define x264_frame_sort_dts(list) x264_frame_sort(list, 1)
  #define x264_frame_sort_pts(list) x264_frame_sort(list, 0)
  
-static inline void x264_reference_build_list( x264_t *h, int i_poc, int i_slice_type )
+static inline void x264_reference_build_list( x264_t *h, int i_poc )
  {
      int i;
      int b_ok;
@@ -843,18 +890,15 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc, int i_slice_
      /* build ref list 0/1 */
      h->i_ref0 = 0;
      h->i_ref1 = 0;
-    for( i = 1; i < h->frames.i_max_dpb; i++ )
+    for( i = 0; h->frames.reference[i]; i++ )
      {
-        if( h->frames.reference[i]->i_poc >= 0 )
+        if( h->frames.reference[i]->i_poc < i_poc )
          {
-            if( h->frames.reference[i]->i_poc < i_poc )
-            {
-                h->fref0[h->i_ref0++] = h->frames.reference[i];
-            }
-            else if( h->frames.reference[i]->i_poc > i_poc )
-            {
-                h->fref1[h->i_ref1++] = h->frames.reference[i];
-            }
+            h->fref0[h->i_ref0++] = h->frames.reference[i];
+        }
+        else if( h->frames.reference[i]->i_poc > i_poc )
+        {
+            h->fref1[h->i_ref1++] = h->frames.reference[i];
          }
      }
  
@@ -891,7 +935,7 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc, int i_slice_
       * We use POC, but check whether explicit reordering is needed */
      h->b_ref_reorder[0] =
      h->b_ref_reorder[1] = 0;
-    if( i_slice_type == SLICE_TYPE_P )
+    if( h->sh.i_type == SLICE_TYPE_P )
      {
          for( i = 0; i < h->i_ref0 - 1; i++ )
              if( h->fref0[i]->i_frame_num < h->fref0[i+1]->i_frame_num )
@@ -909,14 +953,52 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc, int i_slice_
      h->mb.pic.i_fref[1] = h->i_ref1;
  }
  
-static inline void x264_fdec_deblock( x264_t *h )
+static void x264_fdec_filter_row( x264_t *h, int mb_y )
  {
-    /* apply deblocking filter to the current decoded picture */
-    if( !h->sh.i_disable_deblocking_filter_idc )
+    /* Row-wise post-processing of the reconstructed frame h->fdec.
+     * mb_y is the mb to be encoded next, not the mb to be filtered here:
+     * x264_slice_write() calls this at the start of every mb row
+     * (i_mb_x == 0), and x264_slices_write() calls it once more with
+     * mb_y == h->sps->i_mb_height after the last row.
+     * Steps: back up pixel rows into h->mb.intra_border_backup, deblock,
+     * expand borders / run x264_frame_filter(), then record progress in
+     * h->fdec->i_lines_completed. */
+    int b_hpel = h->fdec->b_kept_as_ref;
+    int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
+    int b_end = mb_y == h->sps->i_mb_height;
+    int min_y = mb_y - (1 << h->sh.b_mbaff);
+#ifndef DEBUG_DUMP_FRAME
+    b_deblock &= b_hpel; /* without frame dumping, only frames kept as reference are deblocked */
+#endif
+    if( mb_y & h->sh.b_mbaff ) /* with mbaff set, odd values of mb_y do nothing */
+        return;
+    if( min_y < 0 ) /* no earlier row exists yet */
+        return;
+
+    if( !b_end )
+    {
+        int i, j;
+        for( j=0; j<=h->sh.b_mbaff; j++ )
+            for( i=0; i<3; i++ ) /* i indexes h->fdec->plane[]; planes 1 and 2 use halved sizes (>> !!i) */
+            {
+                memcpy( h->mb.intra_border_backup[j][i],
+                        h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i],
+                        h->sps->i_mb_width*16 >> !!i );
+            }
+    }
+
+    if( b_deblock )
      {
-        TIMER_START( i_mtime_filter );
-        x264_frame_deblocking_filter( h, h->sh.i_type );
-        TIMER_STOP( i_mtime_filter );
+        int max_y = b_end ? h->sps->i_mb_height : mb_y;
+        int y;
+        for( y = min_y; y < max_y; y += (1 << h->sh.b_mbaff) )
+            x264_frame_deblock_row( h, y );
+    }
+
+    if( b_hpel )
+    {
+        x264_frame_expand_border( h, h->fdec, min_y, b_end );
+        x264_frame_filter( h->param.cpu, h->fdec, h->sh.b_mbaff, min_y, b_end );
+        x264_frame_expand_border_filtered( h, h->fdec, min_y, b_end );
+    }
+
+    if( h->param.i_threads > 1 )
+    {
+        /* this must be an atomic store. a 32bit int should be so on sane architectures.
+         * The value is polled without locking by threads that use this
+         * frame as a reference (the wait loop on fref[j]->i_lines_completed
+         * in the analyse.c part of this change).
+         * At the end of the frame 10000 is added, presumably so that any
+         * threshold a waiting thread computes is satisfied; otherwise
+         * X264_THREAD_HEIGHT (doubled for mbaff) lines are held back,
+         * presumably because they are not final yet -- TODO confirm. */
+        h->fdec->i_lines_completed = mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff));
      }
  }
  
@@ -924,16 +1006,18 @@ static inline void x264_reference_update( x264_t *h )
  {
      int i;
  
-    x264_fdec_deblock( h );
-
-    /* expand border */
-    x264_frame_expand_border( h, h->fdec );
+    if( h->fdec->i_frame >= 0 )
+        h->i_frame++;
  
-    /* create filtered images */
-    x264_frame_filter( h->param.cpu, h->fdec, h->sh.b_mbaff );
-
-    /* expand border of filtered images */
-    x264_frame_expand_border_filtered( h, h->fdec );
+    if( !h->fdec->b_kept_as_ref )
+    {
+        if( h->param.i_threads > 1 )
+        {
+            x264_frame_put_unused( h, h->fdec );
+            h->fdec = x264_frame_get_unused( h );
+        }
+        return;
+    }
  
      /* move lowres copy of the image to the ref frame */
      for( i = 0; i < 4; i++)
@@ -947,39 +1031,33 @@ static inline void x264_reference_update( x264_t *h )
          h->frames.last_nonb = h->fdec;
  
      /* move frame in the buffer */
-    h->fdec = h->frames.reference[h->frames.i_max_dpb-1];
-    for( i = h->frames.i_max_dpb-1; i > 0; i-- )
-    {
-        h->frames.reference[i] = h->frames.reference[i-1];
-    }
-    h->frames.reference[0] = h->fdec;
+    x264_frame_put( h->frames.reference, h->fdec );
+    if( h->frames.reference[h->frames.i_max_dpb] )
+        x264_frame_put_unused( h, x264_frame_get( h->frames.reference ) );
+    h->fdec = x264_frame_get_unused( h );
  }
  
  static inline void x264_reference_reset( x264_t *h )
  {
-    int i;
-
-    /* reset ref pictures */
-    for( i = 1; i < h->frames.i_max_dpb; i++ )
-    {
-        h->frames.reference[i]->i_poc = -1;
-    }
-    h->frames.reference[0]->i_poc = 0;
+    while( h->frames.reference[0] )
+        x264_frame_put_unused( h, x264_frame_get( h->frames.reference ) );
+    h->fdec->i_poc =
+    h->fenc->i_poc = 0;
  }
  
-static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_slice_type, int i_global_qp )
+static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
  {
      /* ------------------------ Create slice header  ----------------------- */
      if( i_nal_type == NAL_SLICE_IDR )
      {
-        x264_slice_header_init( h, &h->sh, h->sps, h->pps, i_slice_type, h->i_idr_pic_id, h->i_frame_num, i_global_qp );
+        x264_slice_header_init( h, &h->sh, h->sps, h->pps, h->i_idr_pic_id, h->i_frame_num, i_global_qp );
  
          /* increment id */
          h->i_idr_pic_id = ( h->i_idr_pic_id + 1 ) % 65536;
      }
      else
      {
-        x264_slice_header_init( h, &h->sh, h->sps, h->pps, i_slice_type, -1, h->i_frame_num, i_global_qp );
+        x264_slice_header_init( h, &h->sh, h->sps, h->pps, -1, h->i_frame_num, i_global_qp );
  
          /* always set the real higher num of ref frame used */
          h->sh.b_num_ref_idx_override = 1;
@@ -1036,9 +1114,11 @@ static int x264_slice_write( x264_t *h )
      {
          const int i_mb_y = mb_xy / h->sps->i_mb_width;
          const int i_mb_x = mb_xy % h->sps->i_mb_width;
-
          int mb_spos = bs_pos(&h->out.bs);
  
+        if( i_mb_x == 0 )
+            x264_fdec_filter_row( h, i_mb_y );
+
          /* load cache */
          x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
  
@@ -1168,7 +1248,34 @@ static int x264_slice_write( x264_t *h )
      return 0;
  }
  
-static inline int x264_slices_write( x264_t *h )
+/* Bring the encoder context `dst` up to date with `src`.
+ * Frame reference counts are adjusted first, then two ranges of x264_t and
+ * the statistics are copied from src to dst. Does nothing if dst == src. */
+static void x264_thread_sync_context( x264_t *dst, x264_t *src )
+{
+    int i;
+
+    if( dst == src )
+        return;
+
+    /* Reference counting: take one more reference on every frame src holds
+     * (its reference list and its fdec) and release the ones dst held.
+     * Presumably the copy below makes dst share src's frames -- verify
+     * against the layout of x264_t. The increments must come before the
+     * releases so a frame held by both never drops to zero in between. */
+    for( i = 0; src->frames.reference[i]; i++ )
+        src->frames.reference[i]->i_reference_count++;
+    for( i = 0; dst->frames.reference[i]; i++ )
+        x264_frame_put_unused( src, dst->frames.reference[i] );
+    src->fdec->i_reference_count++;
+    x264_frame_put_unused( src, dst->fdec );
+
+    /* Copy everything except the per-thread pointers and the constants:
+     * [i_frame, mb.type) and [mb.i_type, rc), plus the statistics. */
+    memcpy( &dst->i_frame, &src->i_frame,
+            offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) );
+    memcpy( &dst->mb.i_type, &src->mb.i_type,
+            offsetof(x264_t, rc) - offsetof(x264_t, mb.i_type) );
+    dst->stat = src->stat;
+}
+
+/* Copy statistics from `src` to `dst`, leaving dst->stat.frame untouched.
+ * The copy starts at stat.i_slice_count and spans
+ * sizeof(stat) - sizeof(stat.frame) bytes; this assumes stat.frame is laid
+ * out before i_slice_count -- confirm against the struct definition.
+ * Does nothing if dst == src. */
+static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
+{
+    if( dst != src )
+        memcpy( &dst->stat.i_slice_count, &src->stat.i_slice_count,
+                sizeof(dst->stat) - sizeof(dst->stat.frame) );
+}
+
+static int x264_slices_write( x264_t *h )
  {
      int i_frame_size;
  
@@ -1177,64 +1284,9 @@ static inline int x264_slices_write( x264_t *h )
          x264_visualize_init( h );
  #endif
  
-    if( h->param.i_threads == 1 )
-    {
-        x264_ratecontrol_threads_start( h );
-        x264_slice_write( h );
-        i_frame_size = h->out.nal[h->out.i_nal-1].i_payload;
-    }
-    else
-    {
-        int i_nal = h->out.i_nal;
-        int i_bs_size = h->out.i_bitstream / h->param.i_threads;
-        int i;
-        /* duplicate contexts */
-        for( i = 0; i < h->param.i_threads; i++ )
-        {
-            x264_t *t = h->thread[i];
-            int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
-            int mb_width = h->sps->i_mb_width << h->sh.b_mbaff;
-            if( i > 0 )
-            {
-                memcpy( t, h, sizeof(x264_t) );
-                t->out.p_bitstream += i*i_bs_size;
-                bs_init( &t->out.bs, t->out.p_bitstream, i_bs_size );
-                t->i_thread_num = i;
-            }
-            t->sh.i_first_mb = (i    * mb_height / h->param.i_threads) * mb_width;
-            t->sh.i_last_mb = ((i+1) * mb_height / h->param.i_threads) * mb_width;
-            t->out.i_nal = i_nal + i;
-        }
-        x264_ratecontrol_threads_start( h );
-
-        /* dispatch */
-#ifdef HAVE_PTHREAD
-        {
-            pthread_t handles[X264_SLICE_MAX];
-            for( i = 0; i < h->param.i_threads; i++ )
-                pthread_create( &handles[i], NULL, (void*)x264_slice_write, (void*)h->thread[i] );
-            for( i = 0; i < h->param.i_threads; i++ )
-                pthread_join( handles[i], NULL );
-        }
-#else
-        for( i = 0; i < h->param.i_threads; i++ )
-            x264_slice_write( h->thread[i] );
-#endif
-
-        /* merge contexts */
-        i_frame_size = h->out.nal[i_nal].i_payload;
-        for( i = 1; i < h->param.i_threads; i++ )
-        {
-            int j;
-            x264_t *t = h->thread[i];
-            h->out.nal[i_nal+i] = t->out.nal[i_nal+i];
-            i_frame_size += t->out.nal[i_nal+i].i_payload;
-            // all entries in stat.frame are ints
-            for( j = 0; j < sizeof(h->stat.frame) / sizeof(int); j++ )
-                ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
-        }
-        h->out.i_nal = i_nal + h->param.i_threads;
-    }
+    x264_slice_write( h );
+    i_frame_size = h->out.nal[h->out.i_nal-1].i_payload;
+    x264_fdec_filter_row( h, h->sps->i_mb_height );
  
  #if VISUALIZE
      if( h->param.b_visualize )
@@ -1244,7 +1296,8 @@ static inline int x264_slices_write( x264_t *h )
      }
  #endif
  
-    return i_frame_size;
+    h->out.i_frame_size = i_frame_size;
+    return 0;
  }
  
  /****************************************************************************
@@ -1265,29 +1318,45 @@ int     x264_encoder_encode( x264_t *h,
                               x264_picture_t *pic_in,
                               x264_picture_t *pic_out )
  {
-    x264_frame_t   *frame_psnr = h->fdec; /* just to keep the current decoded frame for psnr calculation */
+    x264_t *thread_current, *thread_prev, *thread_oldest;
      int     i_nal_type;
      int     i_nal_ref_idc;
-    int     i_slice_type;
-    int     i_frame_size;
-
-    int i;
  
      int   i_global_qp;
  
-    char psz_message[80];
+    if( h->param.i_threads > 1)
+    {
+        int i = ++h->i_thread_phase;
+        int t = h->param.i_threads;
+        thread_current = h->thread[ i%t ];
+        thread_prev    = h->thread[ (i-1)%t ];
+        thread_oldest  = h->thread[ (i+1)%t ];
+        x264_thread_sync_context( thread_current, thread_prev );
+        x264_thread_sync_ratecontrol( thread_current, thread_prev, thread_oldest );
+        h = thread_current;
+//      fprintf(stderr, "current: %p  prev: %p  oldest: %p \n", thread_current, thread_prev, thread_oldest);
+    }
+    else
+    {
+        thread_current =
+        thread_prev    =
+        thread_oldest  = h;
+    }
+
+    // ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
+    x264_reference_update( h );
+    h->fdec->i_lines_completed = -1;
  
      /* no data out */
      *pi_nal = 0;
      *pp_nal = NULL;
  
-
      /* ------------------- Setup new frame from picture -------------------- */
      TIMER_START( i_mtime_encode_frame );
      if( pic_in != NULL )
      {
          /* 1: Copy the picture to a frame and move it to a buffer */
-        x264_frame_t *fenc = x264_frame_get( h->frames.unused );
+        x264_frame_t *fenc = x264_frame_get_unused( h );
  
          x264_frame_copy_picture( h, fenc, pic_in );
  
@@ -1302,7 +1371,7 @@ int     x264_encoder_encode( x264_t *h,
          if( h->frames.b_have_lowres )
              x264_frame_init_lowres( h->param.cpu, fenc );
  
-        if( h->frames.i_input <= h->frames.i_delay )
+        if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
          {
              /* Nothing yet to encode */
              /* waiting for filling bframe buffer */
@@ -1316,7 +1385,10 @@ int     x264_encoder_encode( x264_t *h,
          int bframes = 0;
          /* 2: Select frame types */
          if( h->frames.next[0] == NULL )
+        {
+            x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
              return 0;
+        }
  
          x264_slicetype_decide( h );
  
@@ -1365,31 +1437,31 @@ do_encode:
  
          i_nal_type    = NAL_SLICE_IDR;
          i_nal_ref_idc = NAL_PRIORITY_HIGHEST;
-        i_slice_type = SLICE_TYPE_I;
+        h->sh.i_type = SLICE_TYPE_I;
      }
      else if( h->fenc->i_type == X264_TYPE_I )
      {
          i_nal_type    = NAL_SLICE;
          i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
-        i_slice_type = SLICE_TYPE_I;
+        h->sh.i_type = SLICE_TYPE_I;
      }
      else if( h->fenc->i_type == X264_TYPE_P )
      {
          i_nal_type    = NAL_SLICE;
          i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
-        i_slice_type = SLICE_TYPE_P;
+        h->sh.i_type = SLICE_TYPE_P;
      }
      else if( h->fenc->i_type == X264_TYPE_BREF )
      {
          i_nal_type    = NAL_SLICE;
          i_nal_ref_idc = NAL_PRIORITY_HIGH; /* maybe add MMCO to forget it? -> low */
-        i_slice_type = SLICE_TYPE_B;
+        h->sh.i_type = SLICE_TYPE_B;
      }
      else    /* B frame */
      {
          i_nal_type    = NAL_SLICE;
          i_nal_ref_idc = NAL_PRIORITY_DISPOSABLE;
-        i_slice_type = SLICE_TYPE_B;
+        h->sh.i_type = SLICE_TYPE_B;
      }
  
      h->fdec->i_poc =
@@ -1397,28 +1469,28 @@ do_encode:
      h->fdec->i_type = h->fenc->i_type;
      h->fdec->i_frame = h->fenc->i_frame;
      h->fenc->b_kept_as_ref =
-    h->fdec->b_kept_as_ref = i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE;
+    h->fdec->b_kept_as_ref = i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE && h->param.i_keyint_max > 1;
  
  
  
      /* ------------------- Init                ----------------------------- */
      /* build ref list 0/1 */
-    x264_reference_build_list( h, h->fdec->i_poc, i_slice_type );
+    x264_reference_build_list( h, h->fdec->i_poc );
  
      /* Init the rate control */
-    x264_ratecontrol_start( h, i_slice_type, h->fenc->i_qpplus1 );
+    x264_ratecontrol_start( h, h->fenc->i_qpplus1 );
      i_global_qp = x264_ratecontrol_qp( h );
  
      pic_out->i_qpplus1 =
      h->fdec->i_qpplus1 = i_global_qp + 1;
  
-    if( i_slice_type == SLICE_TYPE_B )
+    if( h->sh.i_type == SLICE_TYPE_B )
          x264_macroblock_bipred_init( h );
  
      /* ------------------------ Create slice header  ----------------------- */
-    x264_slice_init( h, i_nal_type, i_slice_type, i_global_qp );
+    x264_slice_init( h, i_nal_type, i_global_qp );
  
-    if( h->fenc->b_kept_as_ref )
+    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
          h->i_frame_num++;
  
      /* ---------------------- Write the bitstream -------------------------- */
@@ -1429,11 +1501,11 @@ do_encode:
      if(h->param.b_aud){
          int pic_type;
  
-        if(i_slice_type == SLICE_TYPE_I)
+        if(h->sh.i_type == SLICE_TYPE_I)
              pic_type = 0;
-        else if(i_slice_type == SLICE_TYPE_P)
+        else if(h->sh.i_type == SLICE_TYPE_P)
              pic_type = 1;
-        else if(i_slice_type == SLICE_TYPE_B)
+        else if(h->sh.i_type == SLICE_TYPE_B)
              pic_type = 2;
          else
              pic_type = 7;
@@ -1470,13 +1542,20 @@ do_encode:
      }
  
      /* Write frame */
-    i_frame_size = x264_slices_write( h );
+    if( h->param.i_threads > 1 )
+    {
+        pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h );
+        h->b_thread_active = 1;
+    }
+    else
+        x264_slices_write( h );
  
      /* restore CPU state (before using float again) */
      x264_cpu_restore( h->param.cpu );
  
-    if( i_slice_type == SLICE_TYPE_P && !h->param.rc.b_stat_read 
-        && h->param.i_scenecut_threshold >= 0 )
+    if( h->sh.i_type == SLICE_TYPE_P && !h->param.rc.b_stat_read 
+        && h->param.i_scenecut_threshold >= 0
+        && !h->param.b_pre_scenecut )
      {
          const int *mbs = h->stat.frame.i_mb_count;
          int i_mb_i = mbs[I_16x16] + mbs[I_8x8] + mbs[I_4x4];
@@ -1497,8 +1576,8 @@ do_encode:
  
          /* macroblock_analyse() doesn't further analyse skipped mbs,
           * so we have to guess their cost */
-        if( i_mb_s < i_mb )
-            i_intra_cost = i_intra_cost * i_mb / (i_mb - i_mb_s);
+        if( h->stat.frame.i_mbs_analysed > 0 )
+            i_intra_cost = i_intra_cost * i_mb / h->stat.frame.i_mbs_analysed;
  
          if( i_gop_size < h->param.i_keyint_min / 4 )
              f_bias = f_thresh_min / 4;
@@ -1514,15 +1593,15 @@ do_encode:
          f_bias = X264_MIN( f_bias, 1.0 );
  
          /* Bad P will be reencoded as I */
-        if( i_mb_s < i_mb &&
+        if( h->stat.frame.i_mbs_analysed > 0 &&
              i_inter_cost >= (1.0 - f_bias) * i_intra_cost )
          {
              int b;
  
-            x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%.0f Pcost:%.0f ratio:%.3f bias=%.3f lastIDR:%d (I:%d P:%d S:%d)\n",
+            x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%.0f Pcost:%.0f ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d smb:%d)\n",
                        h->fenc->i_frame,
                        (double)i_intra_cost, (double)i_inter_cost,
-                      (double)i_inter_cost / i_intra_cost,
+                      1. - (double)i_inter_cost / i_intra_cost,
                        f_bias, i_gop_size,
                        i_mb_i, i_mb_p, i_mb_s );
  
@@ -1552,8 +1631,6 @@ do_encode:
              /* Do IDR if needed */
              else if( i_gop_size >= h->param.i_keyint_min )
              {
-                x264_frame_t *tmp;
-
                  /* Reset */
                  h->i_frame_num = 0;
  
@@ -1562,8 +1639,8 @@ do_encode:
                  h->fenc->i_poc = 0;
  
                  /* Put enqueued frames back in the pool */
-                while( (tmp = x264_frame_get( h->frames.current ) ) != NULL )
-                    x264_frame_put( h->frames.next, tmp );
+                while( h->frames.current[0] )
+                    x264_frame_put( h->frames.next, x264_frame_get( h->frames.current ) );
                  x264_frame_sort_pts( h->frames.next );
              }
              else
@@ -1574,14 +1651,39 @@ do_encode:
          }
      }
  
+    x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
+    return 0;
+}
+
+static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
+                                    x264_nal_t **pp_nal, int *pi_nal,
+                                    x264_picture_t *pic_out )
+{
+    int i;
+    char psz_message[80];
+
+    if( h->b_thread_active )
+    {
+        pthread_join( h->thread_handle, NULL );
+        h->b_thread_active = 0;
+    }
+    if( !h->out.i_nal )
+    {
+        pic_out->i_type = X264_TYPE_AUTO;
+        return;
+    }
+
+    x264_frame_put_unused( thread_current, h->fenc );
+
      /* End bitstream, set output  */
      *pi_nal = h->out.i_nal;
      *pp_nal = h->out.nal;
+    h->out.i_nal = 0;
  
      /* Set output picture properties */
-    if( i_slice_type == SLICE_TYPE_I )
-        pic_out->i_type = i_nal_type == NAL_SLICE_IDR ? X264_TYPE_IDR : X264_TYPE_I;
-    else if( i_slice_type == SLICE_TYPE_P )
+    if( h->sh.i_type == SLICE_TYPE_I )
+        pic_out->i_type = h->i_nal_type == NAL_SLICE_IDR ? X264_TYPE_IDR : X264_TYPE_I;
+    else if( h->sh.i_type == SLICE_TYPE_P )
          pic_out->i_type = X264_TYPE_P;
      else
          pic_out->i_type = X264_TYPE_B;
@@ -1597,19 +1699,7 @@ do_encode:
  
      /* update rc */
      x264_cpu_restore( h->param.cpu );
-    x264_ratecontrol_end( h, i_frame_size * 8 );
-
-    /* handle references */
-    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE && h->param.i_keyint_max > 1 )
-        x264_reference_update( h );
-#ifdef DEBUG_DUMP_FRAME
-    else
-        x264_fdec_deblock( h );
-#endif
-    x264_frame_put( h->frames.unused, h->fenc );
-
-    /* increase frame count */
-    h->i_frame++;
+    x264_ratecontrol_end( h, h->out.i_frame_size * 8 );
  
      /* restore CPU state (before using float again) */
      x264_cpu_restore( h->param.cpu );
@@ -1619,10 +1709,12 @@ do_encode:
      TIMER_STOP( i_mtime_encode_frame );
  
      /* ---------------------- Compute/Print statistics --------------------- */
+    x264_thread_sync_stat( h, h->thread[0] );
+
      /* Slice stat */
-    h->stat.i_slice_count[i_slice_type]++;
-    h->stat.i_slice_size[i_slice_type] += i_frame_size + NALU_OVERHEAD;
-    h->stat.i_slice_qp[i_slice_type] += i_global_qp;
+    h->stat.i_slice_count[h->sh.i_type]++;
+    h->stat.i_slice_size[h->sh.i_type] += h->out.i_frame_size + NALU_OVERHEAD;
+    h->stat.i_slice_qp[h->sh.i_type] += h->fdec->i_qpplus1 - 1;
  
      for( i = 0; i < X264_MBTYPE_MAX; i++ )
          h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i];
@@ -1635,7 +1727,7 @@ do_encode:
          for( i = 0; i < 32; i++ )
              h->stat.i_mb_count_ref[h->sh.i_type][i] += h->stat.frame.i_mb_count_ref[i];
      }
-    if( i_slice_type == SLICE_TYPE_B )
+    if( h->sh.i_type == SLICE_TYPE_B )
      {
          h->stat.i_direct_frames[ h->sh.b_direct_spatial_mv_pred ] ++;
          if( h->mb.b_direct_auto_write )
@@ -1659,17 +1751,17 @@ do_encode:
          for( i=0; i<3; i++ )
          {
              sqe[i] = x264_pixel_ssd_wxh( &h->pixf,
-                         frame_psnr->plane[i], frame_psnr->i_stride[i],
+                         h->fdec->plane[i], h->fdec->i_stride[i],
                           h->fenc->plane[i], h->fenc->i_stride[i],
                           h->param.i_width >> !!i, h->param.i_height >> !!i );
          }
          x264_cpu_restore( h->param.cpu );
  
-        h->stat.i_sqe_global[i_slice_type] += sqe[0] + sqe[1] + sqe[2];
-        h->stat.f_psnr_average[i_slice_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
-        h->stat.f_psnr_mean_y[i_slice_type] += x264_psnr( sqe[0], h->param.i_width * h->param.i_height );
-        h->stat.f_psnr_mean_u[i_slice_type] += x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4 );
-        h->stat.f_psnr_mean_v[i_slice_type] += x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4 );
+        h->stat.i_sqe_global[h->sh.i_type] += sqe[0] + sqe[1] + sqe[2];
+        h->stat.f_psnr_average[h->sh.i_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
+        h->stat.f_psnr_mean_y[h->sh.i_type] += x264_psnr( sqe[0], h->param.i_width * h->param.i_height );
+        h->stat.f_psnr_mean_u[h->sh.i_type] += x264_psnr( sqe[1], h->param.i_width * h->param.i_height / 4 );
+        h->stat.f_psnr_mean_v[h->sh.i_type] += x264_psnr( sqe[2], h->param.i_width * h->param.i_height / 4 );
  
          snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f",
                    x264_psnr( sqe[0], h->param.i_width * h->param.i_height ),
@@ -1681,10 +1773,10 @@ do_encode:
      {
          // offset by 2 pixels to avoid alignment of ssim blocks with dct blocks
          float ssim_y = x264_pixel_ssim_wxh( &h->pixf,
-                         frame_psnr->plane[0] + 2+2*frame_psnr->i_stride[0], frame_psnr->i_stride[0],
+                         h->fdec->plane[0] + 2+2*h->fdec->i_stride[0], h->fdec->i_stride[0],
                           h->fenc->plane[0] + 2+2*h->fenc->i_stride[0], h->fenc->i_stride[0],
                           h->param.i_width-2, h->param.i_height-2 );
-        h->stat.f_ssim_mean_y[i_slice_type] += ssim_y;
+        h->stat.f_ssim_mean_y[h->sh.i_type] += ssim_y;
          snprintf( psz_message + strlen(psz_message), 80 - strlen(psz_message),
                    " SSIM Y:%.5f", ssim_y );
      }
@@ -1692,17 +1784,21 @@ do_encode:
      
      x264_log( h, X264_LOG_DEBUG,
                    "frame=%4d QP=%i NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n",
-              h->i_frame - 1,
-              i_global_qp,
-              i_nal_ref_idc,
-              i_slice_type == SLICE_TYPE_I ? 'I' : (i_slice_type == SLICE_TYPE_P ? 'P' : 'B' ),
-              frame_psnr->i_poc,
+              h->i_frame,
+              h->fdec->i_qpplus1 - 1,
+              h->i_nal_ref_idc,
+              h->sh.i_type == SLICE_TYPE_I ? 'I' : (h->sh.i_type == SLICE_TYPE_P ? 'P' : 'B' ),
+              h->fdec->i_poc,
                h->stat.frame.i_mb_count_i,
                h->stat.frame.i_mb_count_p,
                h->stat.frame.i_mb_count_skip,
-              i_frame_size,
+              h->out.i_frame_size,
                psz_message );
  
+    // keep stats all in one place
+    x264_thread_sync_stat( h->thread[0], h );
+    // for the use of the next frame
+    x264_thread_sync_stat( thread_current, h );
  
  #ifdef DEBUG_MB_TYPE
  {
@@ -1724,9 +1820,8 @@ do_encode:
  
  #ifdef DEBUG_DUMP_FRAME
      /* Dump reconstructed frame */
-    x264_frame_dump( h, frame_psnr, "fdec.yuv" );
+    x264_frame_dump( h, h->fdec, "fdec.yuv" );
  #endif
-    return 0;
  }
  
  /****************************************************************************
@@ -1740,6 +1835,13 @@ void    x264_encoder_close  ( x264_t *h )
      int64_t i_yuv_size = 3 * h->param.i_width * h->param.i_height / 2;
      int i;
  
+    for( i=0; i<h->param.i_threads; i++ )
+    {
+        // don't strictly have to wait for the other threads, but it's simpler than cancelling them
+        if( h->thread[i]->b_thread_active )
+            pthread_join( h->thread[i]->thread_handle, NULL );
+    }
+
  #ifdef DEBUG_BENCHMARK
      x264_log( h, X264_LOG_INFO,
                "analyse=%d(%lldms) encode=%d(%lldms) write=%d(%lldms) filter=%d(%lldms)\n",
@@ -1903,17 +2005,14 @@ void    x264_encoder_close  ( x264_t *h )
      }
  
      /* frames */
-    for( i = 0; i < X264_BFRAME_MAX + 3; i++ )
-    {
-        if( h->frames.current[i] ) x264_frame_delete( h->frames.current[i] );
-        if( h->frames.next[i] )    x264_frame_delete( h->frames.next[i] );
-        if( h->frames.unused[i] )  x264_frame_delete( h->frames.unused[i] );
-    }
-    /* ref frames */
-    for( i = 0; i < h->frames.i_max_dpb; i++ )
-    {
+    for( i = 0; h->frames.current[i]; i++ )
+        x264_frame_delete( h->frames.current[i] );
+    for( i = 0; h->frames.next[i]; i++ )
+        x264_frame_delete( h->frames.next[i] );
+    for( i = 0; h->frames.unused[i]; i++ )
+        x264_frame_delete( h->frames.unused[i] );
+    for( i = 0; h->frames.reference[i]; i++ )
          x264_frame_delete( h->frames.reference[i] );
-    }
  
      /* rc */
      x264_ratecontrol_delete( h );
@@ -1927,9 +2026,10 @@ void    x264_encoder_close  ( x264_t *h )
          free( h->param.rc.psz_rc_eq );
  
      x264_cqm_delete( h );
-    x264_macroblock_cache_end( h );
-    x264_free( h->out.p_bitstream );
-    for( i = 1; i < h->param.i_threads; i++ )
+    for( i = h->param.i_threads - 1; i >= 0; i-- )
+    {
+        x264_macroblock_cache_end( h->thread[i] );
+        x264_free( h->thread[i]->out.p_bitstream );
          x264_free( h->thread[i] );
-    x264_free( h );
+    }
  }
diff --git a/encoder/me.c b/encoder/me.c

index b31b32b816b6fed0a1d6d6f1ac8e178bce632271..60edc7941a27e18c0ff26716d4389bfc985772c0 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -170,16 +170,11 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int
      int mv_x_max = h->mb.mv_max_fpel[0];
      int mv_y_max = h->mb.mv_max_fpel[1];
  
+#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
+
      const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
      const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
  
-    if( h->mb.i_me_method == X264_ME_UMH )
-    {
-        /* clamp mvp to inside frame+padding, so that we don't have to check it each iteration */
-        p_cost_mvx = m->p_cost_mv - x264_clip3( m->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
-        p_cost_mvy = m->p_cost_mv - x264_clip3( m->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
-    }
-
      bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
      bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
      pmx = ( bmx + 2 ) >> 2;
@@ -219,11 +214,6 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int
      
      COST_MV( 0, 0 );
  
-    mv_x_max += 8;
-    mv_y_max += 8;
-    mv_x_min -= 8;
-    mv_y_min -= 8;
-
      switch( h->mb.i_me_method )
      {
      case X264_ME_DIA:
@@ -233,6 +223,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int
              DIA1_ITER( bmx, bmy );
              if( bmx == omx && bmy == omy )
                  break;
+            if( !CHECK_MVRANGE(bmx, bmy) )
+                break;
          }
          break;
  
@@ -251,6 +243,8 @@ me_hex2:
              COST_MV( omx-1, omy-2 );
              if( bmx == omx && bmy == omy )
                  break;
+            if( !CHECK_MVRANGE(bmx, bmy) )
+                break;
          }
  #else
          /* equivalent to the above, but eliminates duplicate candidates */
@@ -272,7 +266,7 @@ me_hex2:
              bmx += hex2[dir+1][0];
              bmy += hex2[dir+1][1];
              /* half hexagon, not overlapping the previous iteration */
-            for( i = 1; i < i_me_range/2; i++ )
+            for( i = 1; i < i_me_range/2 && CHECK_MVRANGE(bmx, bmy); i++ )
              {
                  static const int mod6[8] = {5,0,1,2,3,4,5,0};
                  const int odir = mod6[dir+1];
@@ -430,8 +424,7 @@ me_hex2:
                      {
                          int mx = omx + hex4[j][0]*i;
                          int my = omy + hex4[j][1]*i;
-                        if(    mx >= mv_x_min && mx <= mv_x_max
-                            && my >= mv_y_min && my <= mv_y_max )
+                        if( CHECK_MVRANGE(mx, my) )
                              COST_MV( mx, my );
                      }
                  }
@@ -525,7 +518,7 @@ me_hex2:
      m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ];
      if( bmx == pmx && bmy == pmy && h->mb.i_subpel_refine < 3 )
          m->cost += m->cost_mv;
-    
+
      /* subpel refine */
      if( h->mb.i_subpel_refine >= 2 )
      {
@@ -533,6 +526,8 @@ me_hex2:
          int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
          refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
      }
+    else if( m->mv[1] > h->mb.mv_max_spel[1] )
+        m->mv[1] = h->mb.mv_max_spel[1];
  }
  #undef COST_MV
  
@@ -640,6 +635,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
  
      if( !b_refine_qpel )
      {
+        /* check for mvrange */
+        if( bmy > h->mb.mv_max_spel[1] )
+            bmy = h->mb.mv_max_spel[1];
          bcost = COST_MAX;
          COST_MV_SATD( bmx, bmy, -1 );
      }
@@ -674,6 +672,14 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
              break;
      }
  
+    /* check for mvrange */
+    if( bmy > h->mb.mv_max_spel[1] )
+    {
+        bmy = h->mb.mv_max_spel[1];
+        bcost = COST_MAX;
+        COST_MV_SATD( bmx, bmy, -1 );
+    }
+
      m->cost = bcost;
      m->mv[0] = bmx;
      m->mv[1] = bmy;
@@ -754,6 +760,10 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
      BIME_CACHE( 0, 0 );
      CHECK_BIDIR( 0, 0, 0, 0 );
  
+    if( bm0y > h->mb.mv_max_spel[1] - 8 ||
+        bm1y > h->mb.mv_max_spel[1] - 8 )
+        return bcost;
+
      for( pass = 0; pass < 8; pass++ )
      {
          /* check all mv pairs that differ in at most 2 components from the current mvs. */
@@ -873,6 +883,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
      bdir = -1;
      for( i = 0; i < 2; i++ )
      {
+         if( bmy > h->mb.mv_max_spel[1] - 2 )
+             break;
           omx = bmx;
           omy = bmy;
           odir = bdir;
@@ -892,6 +904,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
      bdir = -1;
      for( i = 0; i < 2; i++ )
      {
+         if( bmy > h->mb.mv_max_spel[1] - 1 )
+             break;
           omx = bmx;
           omy = bmy;
           odir = bdir;
@@ -907,6 +921,9 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
              break;
      }
  
+    if( bmy > h->mb.mv_max_spel[1] )
+        bmy = h->mb.mv_max_spel[1];
+
      m->cost = bcost;
      m->mv[0] = bmx;
      m->mv[1] = bmy;
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c

index bca875bd169195a34ec0d4d85ab17f0cff52d945..dce7d331badb07a7b7b6f023e469481182fa24c5 100644 (file)
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -92,14 +92,14 @@ struct x264_ratecontrol_t
      int qp;                     /* qp for current frame */
      int qpm;                    /* qp for current macroblock */
      float qpa;                  /* average of macroblocks' qp */
-    int slice_type;
      int qp_force;
  
      /* VBV stuff */
      double buffer_size;
-    double buffer_fill;
+    double buffer_fill_final;   /* real buffer as of the last finished frame */
+    double buffer_fill;         /* planned buffer, if all in-progress frames hit their bit budget */
      double buffer_rate;         /* # of bits added to buffer_fill after each frame */
-    predictor_t pred[5];        /* predict frame size from satd */
+    predictor_t *pred;          /* predict frame size from satd */
  
      /* ABR stuff */
      int    last_satd;
@@ -136,10 +136,9 @@ struct x264_ratecontrol_t
  
      /* MBRC stuff */
      double frame_size_planned;
-    int first_row, last_row;    /* region of the frame to be encoded by this thread */
      predictor_t *row_pred;
      predictor_t row_preds[5];
-    predictor_t pred_b_from_p;  /* predict B-frame size from P-frame satd */
+    predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
      int bframes;                /* # consecutive B-frames before this P-frame */
      int bframe_bits;            /* total cost of those frames */
  
@@ -150,8 +149,9 @@ struct x264_ratecontrol_t
  
  static int parse_zones( x264_t *h );
  static int init_pass2(x264_t *);
-static float rate_estimate_qscale( x264_t *h, int pict_type );
+static float rate_estimate_qscale( x264_t *h );
  static void update_vbv( x264_t *h, int bits );
+static void update_vbv_plan( x264_t *h );
  static double predict_size( predictor_t *p, double q, double var );
  static void update_predictor( predictor_t *p, double q, double var, double bits );
  int  x264_rc_analyse_slice( x264_t *h );
@@ -190,7 +190,7 @@ int x264_ratecontrol_new( x264_t *h )
  
      x264_cpu_restore( h->param.cpu );
  
-    h->rc = rc = x264_malloc( h->param.i_threads * sizeof(x264_ratecontrol_t) );
+    rc = h->rc = x264_malloc( h->param.i_threads * sizeof(x264_ratecontrol_t) );
      memset( rc, 0, h->param.i_threads * sizeof(x264_ratecontrol_t) );
  
      rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
@@ -237,7 +237,7 @@ int x264_ratecontrol_new( x264_t *h )
          }
          rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000 / rc->fps;
          rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000;
-        rc->buffer_fill = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
+        rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
          rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
                        * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
          rc->b_vbv = 1;
@@ -285,6 +285,8 @@ int x264_ratecontrol_new( x264_t *h )
  
      rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
      rc->last_qscale = qp2qscale(26);
+    rc->pred = x264_malloc( 5*sizeof(predictor_t) );
+    rc->pred_b_from_p = x264_malloc( sizeof(predictor_t) );
      for( i = 0; i < 5; i++ )
      {
          rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
@@ -297,7 +299,7 @@ int x264_ratecontrol_new( x264_t *h )
          rc->row_preds[i].count= 1.0;
          rc->row_preds[i].decay= 0.5;
      }
-    rc->pred_b_from_p = rc->pred[0];
+    *rc->pred_b_from_p = rc->pred[0];
  
      if( parse_zones( h ) < 0 )
          return -1;
@@ -464,6 +466,12 @@ int x264_ratecontrol_new( x264_t *h )
          x264_free( p );
      }
  
+    for( i=1; i<h->param.i_threads; i++ )
+    {
+        h->thread[i]->rc = rc+i;
+        rc[i] = rc[0];
+    }
+
      return 0;
  }
  
@@ -563,21 +571,35 @@ void x264_ratecontrol_delete( x264_t *h )
              }
          x264_free( rc->psz_stat_file_tmpname );
      }
+    x264_free( rc->pred );
+    x264_free( rc->pred_b_from_p );
      x264_free( rc->entry );
      x264_free( rc->zones );
      x264_free( rc );
  }
  
+static void accum_p_qp_update( x264_t *h, float qp ) /* fold one frame's qp into the decaying accum_p_qp / accum_p_norm sums of h->rc */
+{ /* callers: x264_ratecontrol_start passes rc->qp when i_threads > 1; x264_ratecontrol_end (b_abr branch) passes rc->qpa when i_threads == 1 */
+    x264_ratecontrol_t *rc = h->rc;
+    rc->accum_p_qp   *= .95; /* decay the accumulated qp sum ... */
+    rc->accum_p_norm *= .95; /* ... and its weight by the same factor */
+    rc->accum_p_norm += 1; /* the current frame contributes weight 1 */
+    if( h->sh.i_type == SLICE_TYPE_I )
+        rc->accum_p_qp += qp + rc->ip_offset; /* I slices are accumulated with ip_offset added; NOTE(review): presumably this maps an I-frame qp to a P-equivalent - confirm */
+    else
+        rc->accum_p_qp += qp;
+}
+
  /* Before encoding a frame, choose a QP for it */
-void x264_ratecontrol_start( x264_t *h, int i_slice_type, int i_force_qp )
+void x264_ratecontrol_start( x264_t *h, int i_force_qp )
  {
      x264_ratecontrol_t *rc = h->rc;
      ratecontrol_entry_t *rce = NULL;
+    float q;
  
      x264_cpu_restore( h->param.cpu );
  
      rc->qp_force = i_force_qp;
-    rc->slice_type = i_slice_type;
  
      if( h->param.rc.b_stat_read )
      {
@@ -585,7 +607,7 @@ void x264_ratecontrol_start( x264_t *h, int i_slice_type, int i_force_qp )
          assert( frame >= 0 && frame < rc->num_entries );
          rce = h->rc->rce = &h->rc->entry[frame];
  
-        if( i_slice_type == SLICE_TYPE_B
+        if( h->sh.i_type == SLICE_TYPE_B
              && h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO )
          {
              h->sh.b_direct_spatial_mv_pred = ( rce->direct_mode == 's' );
@@ -593,14 +615,15 @@ void x264_ratecontrol_start( x264_t *h, int i_slice_type, int i_force_qp )
          }
      }
  
-    if( h->fdec->i_row_bits )
+    if( rc->b_vbv )
      {
          memset( h->fdec->i_row_bits, 0, h->sps->i_mb_height * sizeof(int) );
+        rc->row_pred = &rc->row_preds[h->sh.i_type];
+        update_vbv_plan( h );
      }
  
-    if( i_slice_type != SLICE_TYPE_B )
+    if( h->sh.i_type != SLICE_TYPE_B )
      {
-        rc->bframe_bits = 0;
          rc->bframes = 0;
          while( h->frames.current[rc->bframes] && IS_X264_TYPE_B(h->frames.current[rc->bframes]->i_type) )
              rc->bframes++;
@@ -610,27 +633,24 @@ void x264_ratecontrol_start( x264_t *h, int i_slice_type, int i_force_qp )
  
      if( i_force_qp )
      {
-        rc->qpm = rc->qp = i_force_qp - 1;
+        q = i_force_qp - 1;
      }
      else if( rc->b_abr )
      {
-        rc->qpm = rc->qp =
-            x264_clip3( (int)(qscale2qp( rate_estimate_qscale( h, i_slice_type ) ) + .5), 0, 51 );
+        q = qscale2qp( rate_estimate_qscale( h ) );
      }
      else if( rc->b_2pass )
      {
-        rce->new_qscale = rate_estimate_qscale( h, i_slice_type );
-        rc->qpm = rc->qp = rce->new_qp =
-            x264_clip3( (int)(qscale2qp(rce->new_qscale) + 0.5), 0, 51 );
+        rce->new_qscale = rate_estimate_qscale( h );
+        q = qscale2qp( rce->new_qscale );
      }
      else /* CQP */
      {
          x264_zone_t *zone = get_zone( h, h->fenc->i_frame );
-        float q;
-        if( i_slice_type == SLICE_TYPE_B && h->fdec->b_kept_as_ref )
+        if( h->sh.i_type == SLICE_TYPE_B && h->fdec->b_kept_as_ref )
              q = ( rc->qp_constant[ SLICE_TYPE_B ] + rc->qp_constant[ SLICE_TYPE_P ] ) / 2;
          else
-            q = rc->qp_constant[ i_slice_type ];
+            q = rc->qp_constant[ h->sh.i_type ];
  
          if( zone )
          {
@@ -639,9 +659,23 @@ void x264_ratecontrol_start( x264_t *h, int i_slice_type, int i_force_qp )
              else
                  q -= 6*log(zone->f_bitrate_factor)/log(2);
          }
-
-        rc->qpm = rc->qp = (int)(q + 0.5);
      }
+
+    h->fdec->f_qp_avg =
+    rc->qpm =
+    rc->qp = x264_clip3( (int)(q + 0.5), 0, 51 );
+    if( rce )
+        rce->new_qp = rc->qp;
+
+    /* accum_p_qp needs to be here so that future frames can benefit from the
+     * data before this frame is done. but this only works because threading
+     * guarantees to not re-encode any frames. so the non-threaded case does
+     * accum_p_qp later. */
+    if( h->param.i_threads > 1 )
+        accum_p_qp_update( h, rc->qp );
+
+    if( h->sh.i_type != SLICE_TYPE_B )
+        rc->last_non_b_pict_type = h->sh.i_type;
  }
  
  double predict_row_size( x264_t *h, int y, int qp )
@@ -651,7 +685,7 @@ double predict_row_size( x264_t *h, int y, int qp )
      x264_ratecontrol_t *rc = h->rc;
      double pred_s = predict_size( rc->row_pred, qp2qscale(qp), h->fdec->i_row_satd[y] );
      double pred_t = 0;
-    if( rc->slice_type != SLICE_TYPE_I 
+    if( h->sh.i_type != SLICE_TYPE_I 
          && h->fref0[0]->i_type == h->fdec->i_type
          && h->fref0[0]->i_row_satd[y] > 0 )
      {
@@ -668,9 +702,9 @@ double predict_row_size_sum( x264_t *h, int y, int qp )
  {
      int i;
      double bits = 0;
-    for( i = h->rc->first_row; i <= y; i++ )
+    for( i = 0; i <= y; i++ )
          bits += h->fdec->i_row_bits[i];
-    for( i = y+1; i <= h->rc->last_row; i++ )
+    for( i = y+1; i < h->sps->i_mb_height; i++ )
          bits += predict_row_size( h, i, qp );
      return bits;
  }
@@ -690,10 +724,10 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
  
      h->fdec->i_row_qp[y] = rc->qpm;
  
-    if( rc->slice_type == SLICE_TYPE_B )
+    if( h->sh.i_type == SLICE_TYPE_B )
      {
          /* B-frames shouldn't use lower QP than their reference frames */
-        if( y < rc->last_row )
+        if( y < h->sps->i_mb_height-1 )
          {
              rc->qpm = X264_MAX( rc->qp,
                        X264_MIN( h->fref0[0]->i_row_qp[y+1],
@@ -705,7 +739,7 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
          update_predictor( rc->row_pred, qp2qscale(rc->qpm), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
  
          /* tweak quality based on difference from predicted size */
-        if( y < rc->last_row && h->stat.i_slice_count[rc->slice_type] > 0 )
+        if( y < h->sps->i_mb_height-1 && h->stat.i_slice_count[h->sh.i_type] > 0 )
          {
              int prev_row_qp = h->fdec->i_row_qp[y];
              int b0 = predict_row_size_sum( h, y, rc->qpm );
@@ -809,19 +843,15 @@ void x264_ratecontrol_end( x264_t *h, int bits )
          h->stat.frame.i_mb_count_p += mbs[i];
  
      if( h->mb.b_variable_qp )
-    {
-        for( i = 1; i < h->param.i_threads; i++ )
-            rc->qpa += rc[i].qpa;
          rc->qpa /= h->mb.i_mb_count;
-    }
      else
          rc->qpa = rc->qp;
      h->fdec->f_qp_avg = rc->qpa;
  
      if( h->param.rc.b_stat_write )
      {
-        char c_type = rc->slice_type==SLICE_TYPE_I ? (h->fenc->i_poc==0 ? 'I' : 'i')
-                    : rc->slice_type==SLICE_TYPE_P ? 'P'
+        char c_type = h->sh.i_type==SLICE_TYPE_I ? (h->fenc->i_poc==0 ? 'I' : 'i')
+                    : h->sh.i_type==SLICE_TYPE_P ? 'P'
                      : h->fenc->b_kept_as_ref ? 'B' : 'b';
          int dir_frame = h->stat.frame.i_direct_score[1] - h->stat.frame.i_direct_score[0];
          int dir_avg = h->stat.i_direct_score[1] - h->stat.i_direct_score[0];
@@ -843,7 +873,7 @@ void x264_ratecontrol_end( x264_t *h, int bits )
  
      if( rc->b_abr )
      {
-        if( rc->slice_type != SLICE_TYPE_B )
+        if( h->sh.i_type != SLICE_TYPE_B )
              rc->cplxr_sum += bits * qp2qscale(rc->qpa) / rc->last_rceq;
          else
          {
@@ -855,13 +885,8 @@ void x264_ratecontrol_end( x264_t *h, int bits )
          rc->wanted_bits_window += rc->bitrate / rc->fps;
          rc->wanted_bits_window *= rc->cbr_decay;
  
-        rc->accum_p_qp   *= .95;
-        rc->accum_p_norm *= .95;
-        rc->accum_p_norm += 1;
-        if( rc->slice_type == SLICE_TYPE_I )
-            rc->accum_p_qp += rc->qpa * fabs(h->param.rc.f_ip_factor);
-        else
-            rc->accum_p_qp += rc->qpa;
+        if( h->param.i_threads == 1 )
+            accum_p_qp_update( h, rc->qpa );
      }
  
      if( rc->b_2pass )
@@ -871,26 +896,19 @@ void x264_ratecontrol_end( x264_t *h, int bits )
  
      if( h->mb.b_variable_qp )
      {
-        if( rc->slice_type == SLICE_TYPE_B )
+        if( h->sh.i_type == SLICE_TYPE_B )
          {
              rc->bframe_bits += bits;
              if( !h->frames.current[0] || !IS_X264_TYPE_B(h->frames.current[0]->i_type) )
-                update_predictor( &rc->pred_b_from_p, qp2qscale(rc->qpa), h->fref1[0]->i_satd, rc->bframe_bits / rc->bframes );
-        }
-        else
-        {
-            /* Update row predictor based on data collected by other threads. */
-            int y;
-            for( y = rc->last_row+1; y < h->sps->i_mb_height; y++ )
-                update_predictor( rc->row_pred, qp2qscale(h->fdec->i_row_qp[y]), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
-            rc->row_preds[rc->slice_type] = *rc->row_pred;
+            {
+                update_predictor( rc->pred_b_from_p, qp2qscale(rc->qpa),
+                                  h->fref1[h->i_ref1-1]->i_satd, rc->bframe_bits / rc->bframes );
+                rc->bframe_bits = 0;
+            }
          }
      }
  
      update_vbv( h, bits );
-
-    if( rc->slice_type != SLICE_TYPE_B )
-        rc->last_non_b_pict_type = rc->slice_type;
  }
  
  /****************************************************************************
@@ -1067,20 +1085,43 @@ static void update_predictor( predictor_t *p, double q, double var, double bits
      p->coeff += bits*q / var;
  }
  
+// update VBV after encoding a frame
  static void update_vbv( x264_t *h, int bits )
  {
      x264_ratecontrol_t *rcc = h->rc;
+    x264_ratecontrol_t *rct = h->thread[0]->rc; /* thread[0]'s rc: owns buffer_fill_final, which update_vbv_plan() reads back */
  
      if( rcc->last_satd >= h->mb.i_mb_count )
-        update_predictor( &rcc->pred[rcc->slice_type], qp2qscale(rcc->qpa), rcc->last_satd, bits );
+        update_predictor( &rct->pred[h->sh.i_type], qp2qscale(rcc->qpa), rcc->last_satd, bits ); /* slice type now comes from h->sh.i_type; predictor is reached through thread[0]'s rc */
  
      if( !rcc->b_vbv )
          return;
  
-    rcc->buffer_fill += rcc->buffer_rate - bits;
-    if( rcc->buffer_fill < 0 && !rcc->b_2pass )
-        x264_log( h, X264_LOG_WARNING, "VBV underflow (%.0f bits)\n", rcc->buffer_fill );
-    rcc->buffer_fill = x264_clip3( rcc->buffer_fill, 0, rcc->buffer_size );
+    rct->buffer_fill_final += rct->buffer_rate - bits; /* add one frame's buffer_rate, remove the bits this frame actually used */
+    if( rct->buffer_fill_final < 0 && !rct->b_2pass )
+        x264_log( h, X264_LOG_WARNING, "VBV underflow (%.0f bits)\n", rct->buffer_fill_final );
+    rct->buffer_fill_final = x264_clip3( rct->buffer_fill_final, 0, rct->buffer_size ); /* keep the fill within [0, buffer_size] */
+}
+
+// provisionally update VBV according to the planned size of all frames currently in progress
+static void update_vbv_plan( x264_t *h ) /* sets h->rc->buffer_fill from thread[0]'s buffer_fill_final, adjusted for frames other contexts are still encoding */
+{
+    x264_ratecontrol_t *rcc = h->rc;
+    rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final; /* start from the fill that update_vbv() maintains on thread[0]'s rc */
+    if( h->param.i_threads > 1 )
+    {
+        int j = h->rc - h->thread[0]->rc; /* position of h's rc in the rc array (x264_ratecontrol_new sets thread[i]->rc = rc+i) */
+        int i;
+        for( i=1; i<h->param.i_threads; i++ ) /* visit each of the other contexts once, starting with the one after j */
+        {
+            x264_t *t = h->thread[ (j+i)%h->param.i_threads ];
+            double bits = t->rc->frame_size_planned;
+            if( !t->b_thread_active )
+                continue; /* that context has no encode thread running, so it contributes nothing */
+            rcc->buffer_fill += rcc->buffer_rate - bits; /* same per-frame step as update_vbv(), using planned instead of actual bits */
+            rcc->buffer_fill = x264_clip3( rcc->buffer_fill, 0, rcc->buffer_size ); /* NOTE(review): buffer_fill is a double and x264_clip3f is used for doubles elsewhere in this file - confirm x264_clip3 does not truncate here */
+        }
+    }
+}
  
  // apply VBV constraints and clip qscale to between lmin and lmax
@@ -1107,7 +1148,7 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
      {
          /* Now a hard threshold to make sure the frame fits in VBV.
           * This one is mostly for I-frames. */
-        double bits = predict_size( &rcc->pred[rcc->slice_type], q, rcc->last_satd );
+        double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
          double qf = 1.0;
          if( bits > rcc->buffer_fill/2 )
              qf = x264_clip3f( rcc->buffer_fill/(2*bits), 0.2, 1.0 );
@@ -1119,11 +1160,11 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
  
          /* Check B-frame complexity, and use up any bits that would
           * overflow before the next P-frame. */
-        if( rcc->slice_type == SLICE_TYPE_P )
+        if( h->sh.i_type == SLICE_TYPE_P )
          {
              int nb = rcc->bframes;
              double pbbits = bits;
-            double bbits = predict_size( &rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
+            double bbits = predict_size( rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
              double space;
  
              if( bbits > rcc->buffer_rate )
@@ -1159,11 +1200,12 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
  }
  
  // update qscale for 1 frame based on actual bits used so far
-static float rate_estimate_qscale(x264_t *h, int pict_type)
+static float rate_estimate_qscale( x264_t *h )
  {
      float q;
      x264_ratecontrol_t *rcc = h->rc;
      ratecontrol_entry_t rce;
+    int pict_type = h->sh.i_type;
      double lmin = rcc->lmin[pict_type];
      double lmax = rcc->lmax[pict_type];
      int64_t total_bits = 8*(h->stat.i_slice_size[SLICE_TYPE_I]
@@ -1211,6 +1253,7 @@ static float rate_estimate_qscale(x264_t *h, int pict_type)
          else
              q += rcc->pb_offset;
  
+        rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
          rcc->last_satd = 0;
          return qp2qscale(q);
      }
@@ -1245,7 +1288,7 @@ static float rate_estimate_qscale(x264_t *h, int pict_type)
               * tradeoff between quality and bitrate precision. But at large
               * tolerances, the bit distribution approaches that of 2pass. */
  
-            double wanted_bits, overflow, lmin, lmax;
+            double wanted_bits, overflow=1, lmin, lmax;
  
              rcc->last_satd = x264_rc_analyse_slice( h );
              rcc->short_term_cplxsum *= 0.5;
@@ -1266,16 +1309,21 @@ static float rate_estimate_qscale(x264_t *h, int pict_type)
              if( h->param.rc.i_rc_method == X264_RC_CRF )
              {
                  q = get_qscale( h, &rce, rcc->rate_factor_constant, h->fenc->i_frame );
-                overflow = 1;
              }
              else
              {
+                int i_frame_done = h->fenc->i_frame + 1 - h->param.i_threads;
+
                  q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );
  
-                wanted_bits = h->fenc->i_frame * rcc->bitrate / rcc->fps;
-                abr_buffer *= X264_MAX( 1, sqrt(h->fenc->i_frame/25) );
-                overflow = x264_clip3f( 1.0 + (total_bits - wanted_bits) / abr_buffer, .5, 2 );
-                q *= overflow;
+                // FIXME is it simpler to keep track of wanted_bits in ratecontrol_end?
+                wanted_bits = i_frame_done * rcc->bitrate / rcc->fps;
+                if( wanted_bits > 0 )
+                {
+                    abr_buffer *= X264_MAX( 1, sqrt(i_frame_done/25) );
+                    overflow = x264_clip3f( 1.0 + (total_bits - wanted_bits) / abr_buffer, .5, 2 );
+                    q *= overflow;
+                }
              }
  
              if( pict_type == SLICE_TYPE_I && h->param.i_keyint_max > 1
@@ -1313,45 +1361,46 @@ static float rate_estimate_qscale(x264_t *h, int pict_type)
          if( !rcc->b_2pass && h->fenc->i_frame == 0 )
              rcc->last_qscale_for[SLICE_TYPE_P] = q;
  
-        rcc->frame_size_planned = predict_size( &rcc->pred[rcc->slice_type], q, rcc->last_satd );
-
+        rcc->frame_size_planned = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
          return q;
      }
  }
  
-/* Distribute bits among the slices, proportional to their estimated complexity */
-void x264_ratecontrol_threads_start( x264_t *h )
+void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next ) // passes ratecontrol state along the contexts: prev->rc into cur->rc, then cur->rc into next->rc
  {
-    x264_ratecontrol_t *rc = h->rc;
-    int t, y;
-    double den = 0;
-    double frame_size_planned = rc->frame_size_planned;
-
-    for( t = 0; t < h->param.i_threads; t++ )
+    if( cur != prev ) // nothing to copy when prev and cur are the same context
      {
-        h->thread[t]->rc = &rc[t];
-        if( t > 0 )
-            rc[t] = rc[0];
+#define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var))
+        /* these vars are updated in x264_ratecontrol_start()
+         * so copy them from the context that most recently started (prev)
+         * to the context that's about to start (cur).
+         * (memcpy copies sizeof(member) bytes, so it also works for a member that plain assignment could not copy, e.g. an array; last_qscale_for is indexed by slice type elsewhere -- TODO confirm its type)
+         */
+        COPY(accum_p_qp);
+        COPY(accum_p_norm);
+        COPY(last_satd);
+        COPY(last_rceq);
+        COPY(last_qscale_for);
+        COPY(last_non_b_pict_type);
+        COPY(short_term_cplxsum);
+        COPY(short_term_cplxcount);
+        COPY(bframes);
+#undef COPY
      }
-
-    if( !h->mb.b_variable_qp || rc->slice_type == SLICE_TYPE_B )
-        return;
-
-    for( t = 0; t < h->param.i_threads; t++ )
+    if( cur != next ) // nothing to copy when cur and next are the same context
      {
-        rc[t].first_row = h->thread[t]->sh.i_first_mb / h->sps->i_mb_width;
-        rc[t].last_row = (h->thread[t]->sh.i_last_mb-1) / h->sps->i_mb_width;
-        rc[t].frame_size_planned = 1;
-        rc[t].row_pred = &rc[t].row_preds[rc->slice_type];
-        if( h->param.i_threads > 1 )
-        {
-            for( y = rc[t].first_row; y<= rc[t].last_row; y++ )
-                rc[t].frame_size_planned += predict_row_size( h, y, qscale2qp(rc[t].qp) );
-        }
-        den += rc[t].frame_size_planned;
+#define COPY(var) next->rc->var = cur->rc->var
+        /* these vars are updated in x264_ratecontrol_end()
+         * so copy them from the context that most recently ended (cur)
+         * to the context that's about to end (next)
+         * (plain assignment here, so these members are presumably scalars -- TODO confirm)
+         */
+        COPY(cplxr_sum);
+        COPY(expected_bits_sum);
+        COPY(wanted_bits_window);
+        COPY(bframe_bits);
+#undef COPY
      }
-    for( t = 0; t < h->param.i_threads; t++ )
-        rc[t].frame_size_planned *= frame_size_planned / den;
+    //FIXME row_preds[] (not strictly necessary, but would improve prediction)
+    /* the rest of the variables are either constant or thread-local */
  }
  
  static int init_pass2( x264_t *h )
@@ -1487,6 +1536,7 @@ static int init_pass2( x264_t *h )
              rce->expected_bits = expected_bits;
              expected_bits += bits;
              update_vbv(h, bits);
+            rcc->buffer_fill = rcc->buffer_fill_final;
          }
  
  //printf("expected:%llu available:%llu factor:%lf avgQ:%lf\n", (uint64_t)expected_bits, all_available_bits, rate_factor);
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h

index a18c4922bfabbf770520e96219ab409df2f08c51..d4af2c05c707cccbec0b28a2f225997e7b4ef952 100644 (file)
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@@ -27,8 +27,8 @@
  int  x264_ratecontrol_new   ( x264_t * );
  void x264_ratecontrol_delete( x264_t * );
  
-void x264_ratecontrol_start( x264_t *, int i_slice_type, int i_force_qp );
-void x264_ratecontrol_threads_start( x264_t * );
+void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
+void x264_ratecontrol_start( x264_t *, int i_force_qp );
  int  x264_ratecontrol_slice_type( x264_t *, int i_frame );
  void x264_ratecontrol_mb( x264_t *, int bits );
  int  x264_ratecontrol_qp( x264_t * );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c

index dd1c577021a4d0af682cceeedfb6534d6e7aa928..c8685b92f4d0c73e9a84e15f133a20a68b21192f 100644 (file)
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -157,36 +157,27 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
          int mvc[4][2] = {{0}}, i_mvc;
          int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
          i_mvc = 0;
+#define MVC(mv) { mvc[i_mvc][0] = mv[0]; mvc[i_mvc][1] = mv[1]; i_mvc++; }
          if( i_mb_x > 0 )
-        {
-            mvc[i_mvc][0] = fenc_mv[-1][0];
-            mvc[i_mvc][1] = fenc_mv[-1][1];
-            i_mvc++;
-        }
+            MVC(fenc_mv[-1]);
          if( i_mb_y > 0 )
          {
-            mvc[i_mvc][0] = fenc_mv[-i_mb_stride][0];
-            mvc[i_mvc][1] = fenc_mv[-i_mb_stride][1];
-            i_mvc++;
+            MVC(fenc_mv[-i_mb_stride]);
              if( i_mb_x < h->sps->i_mb_width - 1 )
-            {
-                mvc[i_mvc][0] = fenc_mv[-i_mb_stride+1][0];
-                mvc[i_mvc][1] = fenc_mv[-i_mb_stride+1][1];
-                i_mvc++;
-            }
+                MVC(fenc_mv[-i_mb_stride+1]);
              if( i_mb_x > 0 )
-            {
-                mvc[i_mvc][0] = fenc_mv[-i_mb_stride-1][0];
-                mvc[i_mvc][1] = fenc_mv[-i_mb_stride-1][1];
-                i_mvc++;
-            }
+                MVC(fenc_mv[-i_mb_stride-1]);
          }
+#undef MVC
          m[l].mvp[0] = x264_median( mvc[0][0], mvc[1][0], mvc[2][0] );
          m[l].mvp[1] = x264_median( mvc[0][1], mvc[1][1], mvc[2][1] );
  
          x264_me_search( h, &m[l], mvc, i_mvc );
  
-        i_bcost = X264_MIN( i_bcost, m[l].cost + 3 );
+        m[l].cost -= 2; // remove mvcost from skip mbs
+        if( m[l].mv[0] || m[l].mv[1] )
+            m[l].cost += 5;
+        i_bcost = X264_MIN( i_bcost, m[l].cost );
      }
  
      if( b_bidir && (m[0].mv[0] || m[0].mv[1] || m[1].mv[0] || m[1].mv[1]) )
@@ -195,14 +186,16 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
      if( i_bcost < i_cost_bak )
          SAVE_MVS( m[0].mv, m[1].mv );
  
+    //FIXME intra part could be shared across multiple encodings of the frame
  lowres_intra_mb:
+    if( !b_bidir ) // forbid intra-mbs in B-frames, because it's rare and not worth checking
      {
          uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
          uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
-        int intra_penalty = 5 + 10 * b_bidir;
-        int satds[4], i_icost;
+        const int intra_penalty = 5;
+        int satds[4], i_icost, b_intra;
  
-        memcpy( pix-FDEC_STRIDE, src-i_stride, 9 );
+        memcpy( pix-FDEC_STRIDE, src-i_stride, 17 );
          for( i=0; i<8; i++ )
              pix[i*FDEC_STRIDE] = src[i*i_stride];
          pix++;
@@ -222,18 +215,30 @@ lowres_intra_mb:
                  satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
              }
          }
-        i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] ) + intra_penalty;
-        if( i_icost < i_bcost )
+        i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] );
+
+        if( i_icost < i_bcost * 2 )
          {
-            i_bcost = i_icost;
-            if( !b_bidir
-                && i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
-                && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1 )
+            DECLARE_ALIGNED( uint8_t, edge[33], 8 );
+            x264_predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+            for( i=3; i<9; i++ )
              {
-                fenc->i_intra_mbs[b-p0]++;
+                int satd;
+                h->predict_8x8[i]( pix, edge );
+                satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+                i_icost = X264_MIN( i_icost, satd );
              }
-            if( p1 > p0+1 )
-                i_bcost = i_bcost * 9 / 8; // arbitrary penalty for I-blocks in and after B-frames
+        }
+
+        i_icost += intra_penalty;
+        b_intra = i_icost < i_bcost;
+        if( b_intra )
+            i_bcost = i_icost;
+        if(    i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
+            && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1 )
+        {
+            fenc->i_intra_mbs[b-p0] += b_intra;
+            fenc->i_cost_est[0][0] += i_icost;
          }
      }
  
@@ -243,63 +248,118 @@ lowres_intra_mb:
  #undef SAVE_MVS
  
  int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
-                               x264_frame_t **frames, int p0, int p1, int b )
+                               x264_frame_t **frames, int p0, int p1, int b,
+                               int b_intra_penalty ) // b_intra_penalty: when nonzero, the returned score is inflated in proportion to the frame's intra MB count
  {
      int i_score = 0;
-    int dist_scale_factor = 128;
-    int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
  
      /* Check whether we already evaluated this frame
       * If we have tried this frame as P, then we have also tried
       * the preceding frames as B. (is this still true?) */
      if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 )
-        return frames[b]->i_cost_est[b-p0][p1-b];
-
-    /* Init MVs so that we don't have to check edge conditions when loading predictors. */
-    /* FIXME: not needed every time */
-    memset( frames[b]->mv[0], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
-    if( b != p1 )
-        memset( frames[b]->mv[1], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
-
-    if( b == p1 )
-        frames[b]->i_intra_mbs[b-p0] = 0;
-    if( p1 != p0 )
-        dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
-
-    /* the edge mbs seem to reduce the predictive quality of the
-     * whole frame's score, but are needed for a spatial distribution. */
-    if( h->param.rc.i_vbv_buffer_size )
      {
-        for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
+        i_score = frames[b]->i_cost_est[b-p0][p1-b]; // reuse the cached estimate instead of re-running the per-MB analysis
+    }
+    else
+    {
+        int dist_scale_factor = 128;
+        int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
+
+        /* Init MVs so that we don't have to check edge conditions when loading predictors. */
+        /* FIXME: not needed every time */
+        memset( frames[b]->mv[0], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
+        if( b != p1 )
+            memset( frames[b]->mv[1], 0, h->sps->i_mb_height * h->sps->i_mb_width * 2*sizeof(int16_t) );
+
+        if( b == p1 )
          {
-            row_satd[ h->mb.i_mb_y ] = 0;
-            for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
+            frames[b]->i_intra_mbs[b-p0] = 0;
+            frames[b]->i_cost_est[0][0] = 0; // reset slot [0][0], which x264_slicetype_mb_cost accumulates the intra cost (i_icost) into
+        }
+        if( p1 != p0 )
+            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
+
+        /* the edge mbs seem to reduce the predictive quality of the
+         * whole frame's score, but are needed for a spatial distribution. */
+        if( h->param.rc.i_vbv_buffer_size )
+        {
+            for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
              {
-                int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
-                row_satd[ h->mb.i_mb_y ] += i_mb_cost;
-                if( h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
-                    h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1 )
+                row_satd[ h->mb.i_mb_y ] = 0;
+                for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
                  {
-                    i_score += i_mb_cost;
+                    int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+                    row_satd[ h->mb.i_mb_y ] += i_mb_cost;
+                    if( h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
+                        h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1 )
+                    {
+                        i_score += i_mb_cost;
+                    }
                  }
              }
          }
+        else
+        {
+            for( h->mb.i_mb_y = 1; h->mb.i_mb_y < h->sps->i_mb_height - 1; h->mb.i_mb_y++ )
+                for( h->mb.i_mb_x = 1; h->mb.i_mb_x < h->sps->i_mb_width - 1; h->mb.i_mb_x++ )
+                    i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+        }
+
+        if( b != p1 ) // frame evaluated as a B-frame (b is not the later reference): scale its score by 100/(120+i_bframe_bias)
+            i_score = i_score * 100 / (120 + h->param.i_bframe_bias);
+
+        frames[b]->i_cost_est[b-p0][p1-b] = i_score;
+//      fprintf( stderr, "frm %d %c(%d,%d): %6d %6d imb:%d  \n", frames[b]->i_frame,
+//               (p1==0?'I':b<p1?'B':'P'), b-p0, p1-b, i_score, frames[b]->i_cost_est[0][0], frames[b]->i_intra_mbs[b-p0] );
+        x264_cpu_restore( h->param.cpu );
      }
-    else
+
+    if( b_intra_penalty ) // applied after the cache store/lookup, so the cached i_cost_est value never includes the penalty
      {
-        for( h->mb.i_mb_y = 1; h->mb.i_mb_y < h->sps->i_mb_height - 1; h->mb.i_mb_y++ )
-            for( h->mb.i_mb_x = 1; h->mb.i_mb_x < h->sps->i_mb_width - 1; h->mb.i_mb_x++ )
-                i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+        // arbitrary penalty for I-blocks after B-frames
+        int nmb = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2); // count of non-edge macroblocks
+        i_score += i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8); // adds i_score/8 when the intra count equals nmb; NOTE(review): nmb is 0 if the frame is exactly 2 MBs wide or tall -- confirm such sizes cannot reach this division
      }
+    return i_score;
+}
  
-    if( b != p1 )
-        i_score = i_score * 100 / (120 + h->param.i_bframe_bias);
+static int scenecut( x264_t *h, x264_frame_t *frame, int pdist ) // returns nonzero when frame's inter cost (predicted from pdist frames back) is close enough to its intra cost to call a scene cut
+{
+    int icost = frame->i_cost_est[0][0]; // intra cost: slot [0][0] is where x264_slicetype_mb_cost accumulates i_icost
+    int pcost = frame->i_cost_est[pdist][0]; // cost slot [b-p0][p1-b] with b == p1 and b-p0 == pdist, as stored by x264_slicetype_frame_cost
+    float f_bias;
+    int i_gop_size = frame->i_frame - h->frames.i_last_idr; // distance of this frame from h->frames.i_last_idr
+    float f_thresh_max = h->param.i_scenecut_threshold / 100.0; // parameter scaled down by 100 to a fraction
+    /* magic numbers pulled out of thin air */
+    float f_thresh_min = f_thresh_max * h->param.i_keyint_min
+                         / ( h->param.i_keyint_max * 4 );
+    int res;
+
+    if( h->param.i_keyint_min == h->param.i_keyint_max ) // no range to ramp over: use a single threshold
+        f_thresh_min= f_thresh_max;
+    if( i_gop_size < h->param.i_keyint_min / 4 ) // f_bias grows with i_gop_size: constant floor first, ...
+        f_bias = f_thresh_min / 4;
+    else if( i_gop_size <= h->param.i_keyint_min ) // ... then proportional to i_gop_size up to f_thresh_min, ...
+        f_bias = f_thresh_min * i_gop_size / h->param.i_keyint_min;
+    else // ... then linear from f_thresh_min towards f_thresh_max between keyint_min and keyint_max
+    {
+        f_bias = f_thresh_min
+                 + ( f_thresh_max - f_thresh_min )
+                   * ( i_gop_size - h->param.i_keyint_min )
+                   / ( h->param.i_keyint_max - h->param.i_keyint_min ); // NOTE(review): divisor is 0 when keyint_min == keyint_max; only reachable if i_gop_size > keyint_min in that configuration -- confirm callers rule it out
+    }
  
-    frames[b]->i_cost_est[b-p0][p1-b] = i_score;
-//  fprintf( stderr, "frm %d %c(%d,%d): %6d I:%d  \n", frames[b]->i_frame,
-//           (p1==0?'I':b<p1?'B':'P'), b-p0, p1-b, i_score, frames[b]->i_intra_mbs[b-p0] );
-    x264_cpu_restore( h->param.cpu );
-    return i_score;
+    res = pcost >= (1.0 - f_bias) * icost; // cut when inter prediction saves no more than a fraction f_bias of the intra cost
+    if( res )
+    {
+        int imb = frame->i_intra_mbs[pdist];
+        int pmb = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2) - imb; // non-edge MBs not counted as intra; used only in the debug message below
+        x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
+                  frame->i_frame,
+                  icost, pcost, 1. - (double)pcost / icost,
+                  f_bias, i_gop_size, imb, pmb );
+    }
+    return res;
  }
  
  void x264_slicetype_analyse( x264_t *h )
@@ -311,6 +371,7 @@ void x264_slicetype_analyse( x264_t *h )
      int j;
      int i_mb_count = (h->sps->i_mb_width - 2) * (h->sps->i_mb_height - 2);
      int cost1p0, cost2p0, cost1b1, cost2p1;
+    int idr_frame_type;
  
      if( !h->frames.last_nonb )
          return;
@@ -321,22 +382,30 @@ void x264_slicetype_analyse( x264_t *h )
      num_frames = X264_MIN( j, keyint_limit );
      if( num_frames == 0 )
          return;
+
+    x264_lowres_context_init( h, &a );
+    idr_frame_type = frames[1]->i_frame - h->frames.i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
+
      if( num_frames == 1 )
      {
  no_b_frames:
          frames[1]->i_type = X264_TYPE_P;
+        if( h->param.b_pre_scenecut && h->param.i_scenecut_threshold >= 0 )
+        {
+            x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
+            if( scenecut( h, frames[1], 1 ) )
+                frames[1]->i_type = idr_frame_type;
+        }
          return;
      }
  
-    x264_lowres_context_init( h, &a );
-
-    cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2 );
+    cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2, 1 );
      if( frames[2]->i_intra_mbs[2] > i_mb_count / 2 )
          goto no_b_frames;
  
-    cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1 );
-    cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1 );
-    cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2 );
+    cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1, 0 );
+    cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
+    cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2, 0 );
  //  fprintf( stderr, "PP: %d + %d <=> BP: %d + %d \n",
  //           cost1p0, cost2p0, cost1b1, cost2p1 );
      if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
@@ -350,7 +419,7 @@ no_b_frames:
      for( j = 2; j <= X264_MIN( h->param.i_bframe, num_frames-1 ); j++ )
      {
          int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-1), INTER_THRESH/10);
-        int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1 );
+        int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1, 1 );
  //      fprintf( stderr, "frm%d+%d: %d <=> %d, I:%d/%d \n",
  //               frames[0]->i_frame, j-1, pthresh, pcost/i_mb_count,
  //               frames[j+1]->i_intra_mbs[j+1], i_mb_count );
@@ -456,7 +525,7 @@ int x264_rc_analyse_slice( x264_t *h )
      frames[p0] = h->fref0[0];
      frames[b] = h->fenc;
  
-    cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b );
+    cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
      h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
      h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
      h->fdec->i_satd = cost;
diff --git a/x264.c b/x264.c

index 74d5e2645a01b5d696aac1b79aac1f9fe82f57d9..cf194ef90704bf2254e2d946dbadf985651ddb3a 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -156,6 +156,8 @@ static void Help( x264_param_t *defaults, int b_longhelp )
      H0( "  -I, --keyint <integer>      Maximum GOP size [%d]\n", defaults->i_keyint_max );
      H1( "  -i, --min-keyint <integer>  Minimum GOP size [%d]\n", defaults->i_keyint_min );
      H1( "      --scenecut <integer>    How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
+    H1( "      --pre-scenecut          Faster, less precise scenecut detection.\n"
+        "                                  Required and implied by multi-threading.\n" );
      H0( "  -b, --bframes <integer>     Number of B-frames between I and P [%d]\n", defaults->i_bframe );
      H1( "      --no-b-adapt            Disable adaptive B-frame decision\n" );
      H1( "      --b-bias <integer>      Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
@@ -223,6 +225,8 @@ static void Help( x264_param_t *defaults, int b_longhelp )
          "                                  - esa: exhaustive search (slow)\n" );
      else H0( "                                  - dia, hex, umh\n" );
      H0( "      --merange <integer>     Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
+    H1( "      --mvrange <integer>     Maximum motion vector length [-1 (auto)]\n" );
+    H1( "      --mvrange-thread <int>  Minimum buffer between threads [-1 (auto)]\n" );
      H0( "  -m, --subme <integer>       Subpixel motion estimation and partition\n"
          "                                  decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine );
      H0( "      --b-rdo                 RD based mode decision for B-frames. Requires subme 6.\n" );
@@ -296,8 +300,9 @@ static void Help( x264_param_t *defaults, int b_longhelp )
      H0( "      --quiet                 Quiet Mode\n" );
      H0( "      --no-psnr               Disable PSNR computation\n" );
      H0( "      --no-ssim               Disable SSIM computation\n" );
-    H0( "      --threads <integer>     Parallel encoding (uses slices)\n" );
+    H0( "      --threads <integer>     Parallel encoding\n" );
      H0( "      --thread-input          Run Avisynth in its own thread\n" );
+    H1( "      --non-deterministic     Slightly improve quality of SMP, at the cost of repeatability\n" );
      H1( "      --no-asm                Disable all CPU optimizations\n" );
      H1( "      --visualize             Show MB types overlayed on the encoded video\n" );
      H1( "      --sps-id <integer>      Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
@@ -361,6 +366,7 @@ static int  Parse( int argc, char **argv,
              { "min-keyint",required_argument,NULL,'i' },
              { "keyint",  required_argument, NULL, 'I' },
              { "scenecut",required_argument, NULL, 0 },
+            { "pre-scenecut", no_argument,  NULL, 0 },
              { "nf",      no_argument,       NULL, 0 },
              { "no-deblock", no_argument,    NULL, 0 },
              { "filter",  required_argument, NULL, 0 },
@@ -386,6 +392,8 @@ static int  Parse( int argc, char **argv,
              { "weightb", no_argument,       NULL, 'w' },
              { "me",      required_argument, NULL, 0 },
              { "merange", required_argument, NULL, 0 },
+            { "mvrange", required_argument, NULL, 0 },
+            { "mvrange-thread", required_argument, NULL, 0 },
              { "subme",   required_argument, NULL, 'm' },
              { "b-rdo",   no_argument,       NULL, 0 },
              { "mixed-refs", no_argument,    NULL, 0 },
@@ -415,6 +423,7 @@ static int  Parse( int argc, char **argv,
              { "qpfile",  required_argument, NULL, OPT_QPFILE },
              { "threads", required_argument, NULL, 0 },
              { "thread-input", no_argument,  NULL, OPT_THREAD_INPUT },
+            { "non-deterministic", no_argument, NULL, 0 },
              { "no-psnr", no_argument,       NULL, 0 },
              { "no-ssim", no_argument,       NULL, 0 },
              { "quiet",   no_argument,       NULL, OPT_QUIET },
diff --git a/x264.h b/x264.h

index 491393f9a6570ac35f1b9ba23c4944d2085b51bf..d2897d8479b9b13c630d4e048d7ebb7fab07f697 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -132,7 +132,8 @@ typedef struct
  {
      /* CPU flags */
      unsigned int cpu;
-    int         i_threads;  /* divide each frame into multiple slices, encode in parallel */
+    int         i_threads;       /* encode multiple frames in parallel */
+    int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
  
      /* Video Properties */
      int         i_width;
@@ -166,6 +167,7 @@ typedef struct
      int         i_keyint_max;       /* Force an IDR keyframe at this interval */
      int         i_keyint_min;       /* Scenecuts closer together than this are coded as I, not IDR. */
      int         i_scenecut_threshold; /* how aggressively to insert extra I frames */
+    int         b_pre_scenecut;     /* compute scenecut on lowres frames */
      int         i_bframe;   /* how many b-frame between 2 references pictures */
      int         b_bframe_adaptive;
      int         i_bframe_bias;
@@ -210,6 +212,7 @@ typedef struct
          int          i_me_method; /* motion estimation algorithm to use (X264_ME_*) */
          int          i_me_range; /* integer pixel motion estimation search range (from predicted mv) */
          int          i_mv_range; /* maximum length of a mv (in pixels). -1 = auto, based on level */
+        int          i_mv_range_thread; /* minimum space between threads. -1 = auto, based on number of threads. */
          int          i_subpel_refine; /* subpixel motion estimation quality */
          int          b_bidir_me; /* jointly optimize both MVs in B-frames */
          int          b_chroma_me; /* chroma ME for subpel and mode decision in P-frames */
author	Loren Merritt <pengvado@videolan.org>
	Fri, 15 Dec 2006 23:03:36 +0000 (23:03 +0000)
committer	Loren Merritt <pengvado@videolan.org>
	Fri, 15 Dec 2006 23:03:36 +0000 (23:03 +0000)
common/amd64/mc-a2.asm		patch \| blob \| history
common/common.c		patch \| blob \| history
common/common.h		patch \| blob \| history
common/frame.c		patch \| blob \| history
common/frame.h		patch \| blob \| history
common/i386/mc-a2.asm		patch \| blob \| history
common/macroblock.c		patch \| blob \| history
common/mc.c		patch \| blob \| history
doc/threads.txt	[new file with mode: 0644]	patch \| blob
encoder/analyse.c		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
encoder/me.c		patch \| blob \| history
encoder/ratecontrol.c		patch \| blob \| history
encoder/ratecontrol.h		patch \| blob \| history
encoder/slicetype.c		patch \| blob \| history
x264.c		patch \| blob \| history
x264.h		patch \| blob \| history