2-pass VBV support and improved VBV handling

[x264] / common / frame.c
diff --git a/common/frame.c b/common/frame.c

index 02d4da48638cbc25e5935a15dbddd93e891b7a1e..70bcf8a0abf5e7ddec27fa68b665495e06dcbecb 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -23,9 +23,6 @@
  
  #include "common.h"
  
-#define PADH 32
-#define PADV 32
-
  x264_frame_t *x264_frame_new( x264_t *h )
  {
      x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
@@ -34,6 +31,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
      int i_mb_count = h->mb.i_mb_count;
      int i_stride, i_width, i_lines;
      int i_padv = PADV << h->param.b_interlaced;
+    int luma_plane_size;
  
      if( !frame ) return NULL;
  
@@ -46,36 +44,32 @@ x264_frame_t *x264_frame_new( x264_t *h )
      if( h->param.b_interlaced )
          i_lines = ( i_lines + 31 ) & -32;
  
+    if( h->param.cpu&X264_CPU_CACHELINE_SPLIT )
+    {
+        int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64;
+        i_stride = (i_stride + align-1) & -align;
+    }
+
      frame->i_plane = 3;
      for( i = 0; i < 3; i++ )
      {
-        int i_divh = 1;
-        int i_divw = 1;
-        if( i > 0 )
-        {
-            if( h->param.i_csp == X264_CSP_I420 )
-                i_divh = i_divw = 2;
-            else if( h->param.i_csp == X264_CSP_I422 )
-                i_divw = 2;
-        }
-        frame->i_stride[i] = i_stride / i_divw;
-        frame->i_width[i] = i_width / i_divw;
-        frame->i_lines[i] = i_lines / i_divh;
-        CHECKED_MALLOC( frame->buffer[i],
-                        frame->i_stride[i] * ( frame->i_lines[i] + 2*i_padv / i_divh ) );
-
-        frame->plane[i] = ((uint8_t*)frame->buffer[i]) +
-                          frame->i_stride[i] * i_padv / i_divh + PADH / i_divw;
+        frame->i_stride[i] = i_stride >> !!i;
+        frame->i_width[i] = i_width >> !!i;
+        frame->i_lines[i] = i_lines >> !!i;
      }
  
-    frame->filtered[0] = frame->plane[0];
-    for( i = 0; i < 3; i++ )
+    luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
+    for( i = 1; i < 3; i++ )
      {
-        CHECKED_MALLOC( frame->buffer[4+i],
-                        frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ) );
-        frame->filtered[i+1] = ((uint8_t*)frame->buffer[4+i]) +
-                                frame->i_stride[0] * i_padv + PADH;
+        CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
+        frame->plane[i] = (uint8_t*)frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
      }
+    /* all 4 luma planes allocated together, since the cacheline split code
+     * requires them to be in-phase wrt cacheline alignment. */
+    CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
+    for( i = 0; i < 4; i++ )
+        frame->filtered[i] = (uint8_t*)frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+    frame->plane[0] = frame->filtered[0];
  
      if( h->frames.b_have_lowres )
      {
@@ -91,11 +85,11 @@ x264_frame_t *x264_frame_new( x264_t *h )
          }
      }
  
-    if( h->param.analyse.i_me_method == X264_ME_ESA )
+    if( h->param.analyse.i_me_method >= X264_ME_ESA )
      {
-        CHECKED_MALLOC( frame->buffer[7],
+        CHECKED_MALLOC( frame->buffer[3],
                          2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
-        frame->integral = (uint16_t*)frame->buffer[7] + frame->i_stride[0] * i_padv + PADH;
+        frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
      }
  
      frame->i_poc = -1;
@@ -139,7 +133,7 @@ fail:
  void x264_frame_delete( x264_frame_t *frame )
  {
      int i, j;
-    for( i = 0; i < 8; i++ )
+    for( i = 0; i < 4; i++ )
          x264_free( frame->buffer[i] );
      for( i = 0; i < 4; i++ )
          x264_free( frame->buffer_lowres[i] );
@@ -158,17 +152,35 @@ void x264_frame_delete( x264_frame_t *frame )
      x264_free( frame );
  }
  
-void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
+int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) /* returns 0 on success, -1 on a rejected colorspace */
  {
      int i_csp = src->img.i_csp & X264_CSP_MASK;
+    int i;
+    if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 ) /* only I420 and YV12 are accepted; checked before dst is touched */
+    {
+        x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
+        return -1;
+    }
+
      dst->i_type     = src->i_type;
      dst->i_qpplus1  = src->i_qpplus1;
      dst->i_pts      = src->i_pts;
  
-    if( i_csp <= X264_CSP_NONE  || i_csp >= X264_CSP_MAX )
-        x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
-    else
-        h->csp.convert[i_csp]( &h->mc, dst, &src->img, h->param.i_width, h->param.i_height );
+    for( i=0; i<3; i++ )
+    {
+        int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i; /* YV12: i^3 maps 1<->2, so source planes 1 and 2 are swapped */
+        uint8_t *plane = src->img.plane[s];
+        int stride = src->img.i_stride[s];
+        int width = h->param.i_width >> !!i; /* planes 1 and 2 are copied at half width */
+        int height = h->param.i_height >> !!i; /* planes 1 and 2 are copied at half height */
+        if( src->img.i_csp & X264_CSP_VFLIP )
+        {
+            plane += (height-1)*stride; /* start at the last source row ... */
+            stride = -stride; /* ... and step backwards, so rows are copied in reverse order */
+        }
+        h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
+    }
+    return 0;
  }
  
  
@@ -226,19 +238,20 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
  
  void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
  {
-    /* during filtering, 8 extra pixels were filtered on each edge. 
+    /* during filtering, 8 extra pixels were filtered on each edge,
+     * but up to 3 of the horizontal ones may be wrong. 
         we want to expand border from the last filtered pixel */
      int b_start = !mb_y;
      int stride = frame->i_stride[0];
-    int width = 16*h->sps->i_mb_width + 16;
+    int width = 16*h->sps->i_mb_width + 8;
      int height = b_end ? (16*(h->sps->i_mb_height - mb_y) >> h->sh.b_mbaff) + 16 : 16;
-    int padh = PADH - 8;
+    int padh = PADH - 4;
      int padv = PADV - 8;
      int i;
      for( i = 1; i < 4; i++ )
      {
          // buffer: 8 luma, to match the hpel filter
-        uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 8;
+        uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
          if( h->sh.b_mbaff )
          {
              plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
@@ -740,18 +753,26 @@ void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta
  void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
  void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
  
-#ifdef ARCH_X86_64
  void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
  void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-#else
+void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
+#ifdef ARCH_X86
  void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
  void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
  
  void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
  {
      x264_deblock_v8_luma_mmxext( pix,   stride, alpha, beta, tc0   );
      x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
  }
+void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) /* built from two calls to the v8 intra routine */
+{
+    x264_deblock_v8_luma_intra_mmxext( pix,   stride, alpha, beta ); /* first call at pix */
+    x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta ); /* second call 8 bytes further along */
+}
  #endif
  #endif
  
@@ -778,17 +799,19 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
          pf->deblock_h_chroma = x264_deblock_h_chroma_mmxext;
          pf->deblock_v_chroma_intra = x264_deblock_v_chroma_intra_mmxext;
          pf->deblock_h_chroma_intra = x264_deblock_h_chroma_intra_mmxext;
-
-#ifdef ARCH_X86_64
+#ifdef ARCH_X86
+        pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
+        pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
+        pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_mmxext;
+        pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_mmxext;
+#endif
          if( cpu&X264_CPU_SSE2 )
          {
              pf->deblock_v_luma = x264_deblock_v_luma_sse2;
              pf->deblock_h_luma = x264_deblock_h_luma_sse2;
+            pf->deblock_v_luma_intra = x264_deblock_v_luma_intra_sse2;
+            pf->deblock_h_luma_intra = x264_deblock_h_luma_intra_sse2;
          }
-#else
-        pf->deblock_v_luma = x264_deblock_v_luma_mmxext;
-        pf->deblock_h_luma = x264_deblock_h_luma_mmxext;
-#endif
      }
  #endif
  
@@ -821,11 +844,39 @@ void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
      x264_pthread_mutex_unlock( &frame->mutex );
  }
  
+void x264_frame_size_estimated_set( x264_t *h, int bits ) /* forwards bits to ratecontrol while holding h->fenc->mutex */
+{
+    x264_pthread_mutex_lock( &h->fenc->mutex ); /* same mutex the matching getter takes */
+    x264_ratecontrol_set_estimated_size(h, bits);
+    x264_pthread_mutex_unlock( &h->fenc->mutex );
+}
+
+int x264_frame_size_estimated_get( x264_t const *h) /* returns ratecontrol's estimated size, read under h->fenc->mutex */
+{
+    int size;
+    x264_pthread_mutex_lock( &h->fenc->mutex );
+    size = x264_ratecontrol_get_estimated_size(h); /* copied to a local so the mutex can be released before returning */
+    x264_pthread_mutex_unlock( &h->fenc->mutex );
+    return size;
+}
+
  #else
  void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed )
  {}
  void x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed )
  {}
+
+void x264_frame_size_estimated_set( x264_t *h, int bits ) /* #else variant: forwards to ratecontrol without taking a mutex */
+{
+    x264_ratecontrol_set_estimated_size(h, bits);
+}
+
+int x264_frame_size_estimated_get( x264_t const *h) /* #else variant: returns ratecontrol's estimated size without taking a mutex */
+{
+    int size;
+    size = x264_ratecontrol_get_estimated_size(h); /* read accessor, matching the mutex-guarded variant of this function */
+    return size;
+}
  #endif