Transparent hugepage support

author Henrik Gramner <henrik@gramner.com>

Mon, 8 Jul 2013 19:06:42 +0000 (12:06 -0700)

committer Fiona Glaser <fiona@x264.com>

Fri, 23 Aug 2013 21:04:12 +0000 (14:04 -0700)
author Henrik Gramner <henrik@gramner.com>
Mon, 8 Jul 2013 19:06:42 +0000 (12:06 -0700)
committer Fiona Glaser <fiona@x264.com>
Fri, 23 Aug 2013 21:04:12 +0000 (14:04 -0700)
diff --git a/common/common.c b/common/common.c

index 4921555537067cd1d88c08c12d8c2d94b7ba3e37..4e5ca82f0af013b31eb67f00622223e5691f2e01 100644 (file)
--- a/common/common.c
+++ b/common/common.c
@@ -32,6 +32,9 @@
  #if HAVE_MALLOC_H
  #include <malloc.h>
  #endif
+#if HAVE_THP
+#include <sys/mman.h>
+#endif
  
  const int x264_bit_depth = BIT_DEPTH;
  
@@ -1183,7 +1186,25 @@ void *x264_malloc( int i_size )
  {
      uint8_t *align_buf = NULL;
  #if HAVE_MALLOC_H
-    align_buf = memalign( NATIVE_ALIGN, i_size );
+#if HAVE_THP
+#define HUGE_PAGE_SIZE (2*1024*1024) /* 2 MiB; parenthesized so the expansion is safe next to any operator */
+#define HUGE_PAGE_THRESHOLD (HUGE_PAGE_SIZE*7/8) /* Smallest i_size that takes the huge page path. FIXME: Is this optimal? */
+    /* Attempt to allocate huge pages to reduce TLB misses. */
+    if( i_size >= HUGE_PAGE_THRESHOLD )
+    {
+        align_buf = memalign( HUGE_PAGE_SIZE, i_size );
+        if( align_buf )
+        {
+            /* Round up to the next huge page boundary if we are close enough. */
+            size_t madv_size = (i_size + HUGE_PAGE_SIZE - HUGE_PAGE_THRESHOLD) & ~(HUGE_PAGE_SIZE-1);
+            madvise( align_buf, madv_size, MADV_HUGEPAGE );
+        }
+    }
+    else
+#undef HUGE_PAGE_SIZE
+#undef HUGE_PAGE_THRESHOLD
+#endif
+        align_buf = memalign( NATIVE_ALIGN, i_size );
  #else
      uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) );
      if( buf )
diff --git a/common/common.h b/common/common.h

index c1d6a0c8c2dfdbf680c6027b6b353bdabe2c5df9..12e5763bb1bd6593e1672e65af97ec46074dfe59 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -54,6 +54,31 @@ do {\
      memset( var, 0, size );\
  } while( 0 )
  
+/* Macros for merging multiple allocations into a single large malloc, for improved
+ * use with huge pages. Use PREALLOC_INIT once, PREALLOC() per buffer, then PREALLOC_END(). */
+
+/* Needs to be enough to contain any set of buffers that use combined allocations (capacity of preallocs[], not checked at runtime) */
+#define PREALLOC_BUF_SIZE 1024
+
+#define PREALLOC_INIT\
+    int    prealloc_idx = 0;\
+    size_t prealloc_size = 0;\
+    uint8_t **preallocs[PREALLOC_BUF_SIZE];
+
+#define PREALLOC( var, size )\
+do {\
+    var = (void*)prealloc_size; /* temporarily holds the byte offset inside the combined block */\
+    preallocs[prealloc_idx++] = (uint8_t**)&(var);\
+    prealloc_size += ALIGN((size), NATIVE_ALIGN);\
+} while(0)
+
+#define PREALLOC_END( ptr )\
+do {\
+    CHECKED_MALLOC( ptr, prealloc_size );\
+    while( prealloc_idx-- ) /* rebase every recorded offset onto the real allocation */\
+        *preallocs[prealloc_idx] += (intptr_t)(ptr);\
+} while(0)
+
  #define ARRAY_SIZE(array)  (sizeof(array)/sizeof(array[0]))
  
  #define X264_BFRAME_MAX 16
@@ -699,6 +724,7 @@ struct x264_t
           * and won't be copied from one thread to another */
  
          /* mb table */
+        uint8_t *base;                      /* base pointer for all malloced data in this mb */
          int8_t  *type;                      /* mb type */
          uint8_t *partition;                 /* mb partition */
          int8_t  *qp;                        /* mb qp */
diff --git a/common/frame.c b/common/frame.c

index e56da8efda48e9967cc401b88303aa1b0479bfad..6203e3bf0ea383e9b83dbf365795bde7733118c5 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -86,6 +86,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  #endif
  
      CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
+    PREALLOC_INIT
  
      /* allocate frame data (+64 for extra data for me) */
      i_width  = h->mb.i_mb_width*16;
@@ -124,7 +125,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  
      for( int i = 0; i < h->param.i_bframe + 2; i++ )
          for( int j = 0; j < h->param.i_bframe + 2; j++ )
-            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+            PREALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
  
      frame->i_poc = -1;
      frame->i_type = X264_TYPE_AUTO;
@@ -149,13 +150,9 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
      {
          int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
          int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
-        CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
-        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
+        PREALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
          if( PARAM_INTERLACED )
-        {
-            CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
-            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
-        }
+            PREALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
      }
  
      /* all 4 luma planes allocated together, since the cacheline split code
@@ -167,24 +164,15 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
          if( h->param.analyse.i_subpel_refine && b_fdec )
          {
              /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
-            CHECKED_MALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) );
+            PREALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) );
              if( PARAM_INTERLACED )
-                CHECKED_MALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) );
-            for( int i = 0; i < 4; i++ )
-            {
-                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
-                frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
-            }
-            frame->plane[p] = frame->filtered[p][0];
-            frame->plane_fld[p] = frame->filtered_fld[p][0];
+                PREALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) );
          }
          else
          {
-            CHECKED_MALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) );
+            PREALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) );
              if( PARAM_INTERLACED )
-                CHECKED_MALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) );
-            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH;
-            frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH;
+                PREALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) );
          }
      }
  
@@ -192,36 +180,30 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
  
      if( b_fdec ) /* fdec frame */
      {
-        CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
-        CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
-        CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
-        CHECKED_MALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
-        M32( frame->mv16x16[0] ) = 0;
-        frame->mv16x16++;
-        CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
+        PREALLOC( frame->mb_type, i_mb_count * sizeof(int8_t) );
+        PREALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t) );
+        PREALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
+        PREALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
+        PREALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
          if( h->param.i_bframe )
          {
-            CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
-            CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
+            PREALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
+            PREALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
          }
          else
          {
              frame->mv[1]  = NULL;
              frame->ref[1] = NULL;
          }
-        CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
-        CHECKED_MALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
-        CHECKED_MALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
+        PREALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
+        PREALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
+        PREALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
          if( h->param.analyse.i_me_method >= X264_ME_ESA )
-        {
-            CHECKED_MALLOC( frame->buffer[3],
-                            frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
-            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
-        }
+            PREALLOC( frame->buffer[3], frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
          if( PARAM_INTERLACED )
-            CHECKED_MALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
+            PREALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
          if( h->param.analyse.b_mb_info )
-            CHECKED_MALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) );
+            PREALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) );
      }
      else /* fenc frame */
      {
@@ -229,30 +211,85 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
          {
              int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
  
-            CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
-            for( int i = 0; i < 4; i++ )
-                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
+            PREALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
  
              for( int j = 0; j <= !!h->param.i_bframe; j++ )
                  for( int i = 0; i <= h->param.i_bframe; i++ )
                  {
-                    CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
-                    CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
+                    PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
+                    PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
                  }
-            CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
+            PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
              for( int j = 0; j <= h->param.i_bframe+1; j++ )
                  for( int i = 0; i <= h->param.i_bframe+1; i++ )
-                    CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
-            frame->i_intra_cost = frame->lowres_costs[0][0];
-            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+                    PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
+
          }
          if( h->param.rc.i_aq_mode )
          {
-            CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
-            CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
+            PREALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+            PREALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
              if( h->frames.b_have_lowres )
+                PREALLOC( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+        }
+    }
+
+    PREALLOC_END( frame->base );
+
+    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
+    {
+        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
+        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
+        if( PARAM_INTERLACED )
+            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
+    }
+
+    for( int p = 0; p < luma_plane_count; p++ )
+    {
+        int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
+        if( h->param.analyse.i_subpel_refine && b_fdec )
+        {
+            for( int i = 0; i < 4; i++ )
+            {
+                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
+                frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
+            }
+            frame->plane[p] = frame->filtered[p][0];
+            frame->plane_fld[p] = frame->filtered_fld[p][0];
+        }
+        else
+        {
+            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH;
+            frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH;
+        }
+    }
+
+    if( b_fdec )
+    {
+        M32( frame->mv16x16[0] ) = 0;
+        frame->mv16x16++;
+
+        if( h->param.analyse.i_me_method >= X264_ME_ESA )
+            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
+    }
+    else
+    {
+        if( h->frames.b_have_lowres )
+        {
+            int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
+            for( int i = 0; i < 4; i++ )
+                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
+
+            for( int j = 0; j <= !!h->param.i_bframe; j++ )
+                for( int i = 0; i <= h->param.i_bframe; i++ )
+                    memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) );
+
+            frame->i_intra_cost = frame->lowres_costs[0][0];
+            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+
+            if( h->param.rc.i_aq_mode )
                  /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
-                CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+                memset( frame->i_inv_qscale_factor, 0, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
          }
      }
  
@@ -278,42 +315,8 @@ void x264_frame_delete( x264_frame_t *frame )
       * so freeing those pointers would cause a double free later. */
      if( !frame->b_duplicate )
      {
-        for( int i = 0; i < 4; i++ )
-        {
-            x264_free( frame->buffer[i] );
-            x264_free( frame->buffer_fld[i] );
-        }
-        for( int i = 0; i < 4; i++ )
-            x264_free( frame->buffer_lowres[i] );
-        for( int i = 0; i < X264_BFRAME_MAX+2; i++ )
-            for( int j = 0; j < X264_BFRAME_MAX+2; j++ )
-                x264_free( frame->i_row_satds[i][j] );
-        for( int j = 0; j < 2; j++ )
-            for( int i = 0; i <= X264_BFRAME_MAX; i++ )
-            {
-                x264_free( frame->lowres_mvs[j][i] );
-                x264_free( frame->lowres_mv_costs[j][i] );
-            }
-        x264_free( frame->i_propagate_cost );
-        for( int j = 0; j <= X264_BFRAME_MAX+1; j++ )
-            for( int i = 0; i <= X264_BFRAME_MAX+1; i++ )
-                x264_free( frame->lowres_costs[j][i] );
-        x264_free( frame->f_qp_offset );
-        x264_free( frame->f_qp_offset_aq );
-        x264_free( frame->i_inv_qscale_factor );
-        x264_free( frame->i_row_bits );
-        x264_free( frame->f_row_qp );
-        x264_free( frame->f_row_qscale );
-        x264_free( frame->field );
-        x264_free( frame->effective_qp );
-        x264_free( frame->mb_type );
-        x264_free( frame->mb_partition );
-        x264_free( frame->mv[0] );
-        x264_free( frame->mv[1] );
-        if( frame->mv16x16 )
-            x264_free( frame->mv16x16-1 );
-        x264_free( frame->ref[0] );
-        x264_free( frame->ref[1] );
+        x264_free( frame->base );
+
          if( frame->param && frame->param->param_free )
              frame->param->param_free( frame->param );
          if( frame->mb_info_free )
diff --git a/common/frame.h b/common/frame.h

index 72c1fa3a56c7f4a758e61bf05d367e0e55b543b7..d8416a5df52f03b7f668874dba808718b0b36ce5 100644 (file)
--- a/common/frame.h
+++ b/common/frame.h
@@ -35,6 +35,7 @@
  typedef struct x264_frame
  {
      /* */
+    uint8_t *base;       /* Base pointer for all malloced data in this frame. */
      int     i_poc;
      int     i_delta_poc[2];
      int     i_type;
diff --git a/common/macroblock.c b/common/macroblock.c

index 2d3e7e145f6a85ee511a8b68324a232e081f04a1..45405fde51050e88793b138a82f1ee46ddf58f82 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -256,25 +256,26 @@ int x264_macroblock_cache_allocate( x264_t *h )
  
      h->mb.b_interlaced = PARAM_INTERLACED;
  
-    CHECKED_MALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
-    CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
-    CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
-    CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
-    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
+    PREALLOC_INIT
+
+    PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
+    PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
+    PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
+    PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
  
      /* 0 -> 3 top(4), 4 -> 6 : left(3) */
-    CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
+    PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
  
      /* all coeffs */
-    CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );
+    PREALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );
  
      if( h->param.b_cabac )
      {
-        CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
-        CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
-        CHECKED_MALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) );
+        PREALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
+        PREALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
+        PREALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) );
          if( h->param.i_bframe )
-            CHECKED_MALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) );
+            PREALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) );
      }
  
      for( int i = 0; i < 2; i++ )
@@ -284,11 +285,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
              i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
  
          for( int j = !i; j < i_refs; j++ )
-        {
-            CHECKED_MALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) );
-            M32( h->mb.mvr[i][j][0] ) = 0;
-            h->mb.mvr[i][j]++;
-        }
+            PREALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) );
      }
  
      if( h->param.analyse.i_weighted_pred )
@@ -325,7 +322,24 @@ int x264_macroblock_cache_allocate( x264_t *h )
          }
  
          for( int i = 0; i < numweightbuf; i++ )
-            CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
+            PREALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
+    }
+
+    PREALLOC_END( h->mb.base );
+
+    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
+
+    for( int i = 0; i < 2; i++ )
+    {
+        int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED;
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
+
+        for( int j = !i; j < i_refs; j++ )
+        {
+            M32( h->mb.mvr[i][j][0] ) = 0;
+            h->mb.mvr[i][j]++;
+        }
      }
  
      return 0;
@@ -334,26 +348,7 @@ fail:
  }
  void x264_macroblock_cache_free( x264_t *h )
  {
-    for( int i = 0; i < 2; i++ )
-        for( int j = !i; j < X264_REF_MAX*2; j++ )
-            if( h->mb.mvr[i][j] )
-                x264_free( h->mb.mvr[i][j]-1 );
-    for( int i = 0; i < X264_REF_MAX; i++ )
-        x264_free( h->mb.p_weight_buf[i] );
-
-    if( h->param.b_cabac )
-    {
-        x264_free( h->mb.skipbp );
-        x264_free( h->mb.chroma_pred_mode );
-        x264_free( h->mb.mvd[0] );
-        x264_free( h->mb.mvd[1] );
-    }
-    x264_free( h->mb.slice_table );
-    x264_free( h->mb.intra4x4_pred_mode );
-    x264_free( h->mb.non_zero_count );
-    x264_free( h->mb.mb_transform_size );
-    x264_free( h->mb.cbp );
-    x264_free( h->mb.qp );
+    x264_free( h->mb.base );
  }
  
  int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
diff --git a/configure b/configure

index db1d9e77e4fa8811ec483d3d17f53ef034776029..6f3ac025c63aea70fe3590c8394d7a0b97082da3 100755 (executable)
--- a/configure
+++ b/configure
@@ -290,7 +290,8 @@ cross_prefix=""
  EXE=""
  
  # list of all preprocessor HAVE values we can define
-CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL"
+CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE \
+             LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP"
  
  # parse options
  
@@ -820,6 +821,10 @@ if cc_check "math.h" "-Werror" "return log2f(2);" ; then
      define HAVE_LOG2F
  fi
  
+if [ "$SYS" = "LINUX" -a \( "$ARCH" = "X86" -o "$ARCH" = "X86_64" \) ] && cc_check "sys/mman.h" "" "MADV_HUGEPAGE;" ; then
+    define HAVE_THP
+fi
+
  if [ "$vis" = "yes" ] ; then
      save_CFLAGS="$CFLAGS"
      CFLAGS="$CFLAGS -I/usr/X11R6/include"
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 167daa9acc2963d687fee904ce2ed5daec97a0c7..9ff8acf1e85ea1f10808281d0ba79cd81d608b74 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -2717,7 +2717,7 @@ static void x264_thread_sync_context( x264_t *dst, x264_t *src )
      x264_frame_push_unused( src, dst->fdec );
  
      // copy everything except the per-thread pointers and the constants.
-    memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) );
+    memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.base) - offsetof(x264_t, i_frame) );
      dst->param = src->param;
      dst->stat = src->stat;
      dst->pixf = src->pixf;
author	Henrik Gramner <henrik@gramner.com>
	Mon, 8 Jul 2013 19:06:42 +0000 (12:06 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Fri, 23 Aug 2013 21:04:12 +0000 (14:04 -0700)
common/common.c		patch \| blob \| history
common/common.h		patch \| blob \| history
common/frame.c		patch \| blob \| history
common/frame.h		patch \| blob \| history
common/macroblock.c		patch \| blob \| history
configure		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history