From fa1e2b746d95575b5c5b8e49fcfcad3ded9a5420 Mon Sep 17 00:00:00 2001
From: Henrik Gramner <henrik@gramner.com>
Date: Mon, 8 Jul 2013 12:06:42 -0700
Subject: [PATCH] Transparent hugepage support

Combine frame and mb data mallocs into a single large malloc.
Additionally, on Linux systems with hugepage support, ask for hugepages on
large mallocs.

This gives a small performance improvement (~0.2-0.9%) on systems without
hugepage support, as well as a small memory footprint reduction.

On recent Linux kernels with hugepage support enabled (set to madvise or
always), it improves performance up to 4% at the cost of about 7-12% more
memory usage on typical settings.

It may help even more on Haswell and other recent CPUs with improved 2MB page
support in hardware.
---
 common/common.c     |  23 +++++-
 common/common.h     |  26 +++++++
 common/frame.c      | 177 ++++++++++++++++++++++----------------------
 common/frame.h      |   1 +
 common/macroblock.c |  69 ++++++++---------
 configure           |   7 +-
 encoder/encoder.c   |   2 +-
 7 files changed, 178 insertions(+), 127 deletions(-)

diff --git a/common/common.c b/common/common.c
index 49215555..4e5ca82f 100644
--- a/common/common.c
+++ b/common/common.c
@@ -32,6 +32,9 @@
 #if HAVE_MALLOC_H
 #include <malloc.h>
 #endif
+#if HAVE_THP
+#include <sys/mman.h>
+#endif
 
 const int x264_bit_depth = BIT_DEPTH;
 
@@ -1183,7 +1186,25 @@ void *x264_malloc( int i_size )
 {
     uint8_t *align_buf = NULL;
 #if HAVE_MALLOC_H
-    align_buf = memalign( NATIVE_ALIGN, i_size );
+#if HAVE_THP
+#define HUGE_PAGE_SIZE (2*1024*1024) /* 2 MiB; parenthesized so the expansion is safe next to any operator */
+#define HUGE_PAGE_THRESHOLD (HUGE_PAGE_SIZE*7/8) /* smallest request that takes the huge-page path. FIXME: Is this optimal? */
+    /* Attempt to allocate huge pages to reduce TLB misses. */
+    if( i_size >= HUGE_PAGE_THRESHOLD )
+    {
+        align_buf = memalign( HUGE_PAGE_SIZE, i_size );
+        if( align_buf )
+        {
+            /* Round up to the next huge page boundary if we are close enough. */
+            size_t madv_size = (i_size + HUGE_PAGE_SIZE - HUGE_PAGE_THRESHOLD) & ~(HUGE_PAGE_SIZE-1);
+            madvise( align_buf, madv_size, MADV_HUGEPAGE );
+        }
+    }
+    else
+#undef HUGE_PAGE_SIZE
+#undef HUGE_PAGE_THRESHOLD
+#endif
+        align_buf = memalign( NATIVE_ALIGN, i_size );
 #else
     uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) );
     if( buf )
diff --git a/common/common.h b/common/common.h
index c1d6a0c8..12e5763b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -54,6 +54,31 @@ do {\
     memset( var, 0, size );\
 } while( 0 )
 
+/* Macros for merging multiple allocations into a single large malloc, for improved use with
+ * huge pages. Usage: PREALLOC_INIT once, PREALLOC() per buffer, then PREALLOC_END() to allocate. */
+
+/* Maximum number of PREALLOC() calls between PREALLOC_INIT and PREALLOC_END (not bounds-checked) */
+#define PREALLOC_BUF_SIZE 1024
+
+#define PREALLOC_INIT /* declares the bookkeeping locals used by PREALLOC() and PREALLOC_END() */\
+    int    prealloc_idx = 0;                /* number of pointers registered so far */\
+    size_t prealloc_size = 0;               /* running total of the reserved sizes */\
+    uint8_t **preallocs[PREALLOC_BUF_SIZE]; /* addresses of the registered pointer variables */
+
+#define PREALLOC( var, size ) /* reserve `size` bytes for `var` in the combined allocation */\
+do {\
+    var = (void*)prealloc_size;                  /* holds only the offset until PREALLOC_END adds the base */\
+    preallocs[prealloc_idx++] = (uint8_t**)&var; /* remember which pointer variable to fix up later */\
+    prealloc_size += ALIGN(size, NATIVE_ALIGN);  /* the next buffer starts after ALIGN(size, NATIVE_ALIGN) bytes */\
+} while(0)
+
+#define PREALLOC_END( ptr ) /* allocate the combined buffer into `ptr`, then rebase every PREALLOC()ed pointer */\
+do {\
+    CHECKED_MALLOC( ptr, prealloc_size );\
+    while( prealloc_idx-- ) /* slots hold byte offsets: add the base as integers, not via pointer arithmetic */\
+        *preallocs[prealloc_idx] = (uint8_t*)((intptr_t)*preallocs[prealloc_idx] + (intptr_t)(ptr));\
+} while(0)
+
 #define ARRAY_SIZE(array)  (sizeof(array)/sizeof(array[0]))
 
 #define X264_BFRAME_MAX 16
@@ -699,6 +724,7 @@ struct x264_t
          * and won't be copied from one thread to another */
 
         /* mb table */
+        uint8_t *base;                      /* base pointer for all malloced data in this mb */
         int8_t  *type;                      /* mb type */
         uint8_t *partition;                 /* mb partition */
         int8_t  *qp;                        /* mb qp */
diff --git a/common/frame.c b/common/frame.c
index e56da8ef..6203e3bf 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -86,6 +86,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
 #endif
 
     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
+    PREALLOC_INIT
 
     /* allocate frame data (+64 for extra data for me) */
     i_width  = h->mb.i_mb_width*16;
@@ -124,7 +125,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
 
     for( int i = 0; i < h->param.i_bframe + 2; i++ )
         for( int j = 0; j < h->param.i_bframe + 2; j++ )
-            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+            PREALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
 
     frame->i_poc = -1;
     frame->i_type = X264_TYPE_AUTO;
@@ -149,13 +150,9 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
     {
         int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
         int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
-        CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
-        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
+        PREALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
         if( PARAM_INTERLACED )
-        {
-            CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
-            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
-        }
+            PREALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
     }
 
     /* all 4 luma planes allocated together, since the cacheline split code
@@ -167,24 +164,15 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
         if( h->param.analyse.i_subpel_refine && b_fdec )
         {
             /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
-            CHECKED_MALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) );
+            PREALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) );
             if( PARAM_INTERLACED )
-                CHECKED_MALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) );
-            for( int i = 0; i < 4; i++ )
-            {
-                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
-                frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
-            }
-            frame->plane[p] = frame->filtered[p][0];
-            frame->plane_fld[p] = frame->filtered_fld[p][0];
+                PREALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) );
         }
         else
         {
-            CHECKED_MALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) );
+            PREALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) );
             if( PARAM_INTERLACED )
-                CHECKED_MALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) );
-            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH;
-            frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH;
+                PREALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) );
         }
     }
 
@@ -192,36 +180,30 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
 
     if( b_fdec ) /* fdec frame */
     {
-        CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
-        CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t));
-        CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
-        CHECKED_MALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
-        M32( frame->mv16x16[0] ) = 0;
-        frame->mv16x16++;
-        CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
+        PREALLOC( frame->mb_type, i_mb_count * sizeof(int8_t) );
+        PREALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t) );
+        PREALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
+        PREALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) );
+        PREALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
         if( h->param.i_bframe )
         {
-            CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
-            CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
+            PREALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
+            PREALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
         }
         else
         {
             frame->mv[1]  = NULL;
             frame->ref[1] = NULL;
         }
-        CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
-        CHECKED_MALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
-        CHECKED_MALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
+        PREALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
+        PREALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) );
+        PREALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) );
         if( h->param.analyse.i_me_method >= X264_ME_ESA )
-        {
-            CHECKED_MALLOC( frame->buffer[3],
-                            frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
-            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
-        }
+            PREALLOC( frame->buffer[3], frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
         if( PARAM_INTERLACED )
-            CHECKED_MALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
+            PREALLOC( frame->field, i_mb_count * sizeof(uint8_t) );
         if( h->param.analyse.b_mb_info )
-            CHECKED_MALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) );
+            PREALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) );
     }
     else /* fenc frame */
     {
@@ -229,30 +211,85 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
         {
             int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
 
-            CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
-            for( int i = 0; i < 4; i++ )
-                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
+            PREALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
 
             for( int j = 0; j <= !!h->param.i_bframe; j++ )
                 for( int i = 0; i <= h->param.i_bframe; i++ )
                 {
-                    CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
-                    CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
+                    PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
+                    PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
                 }
-            CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
+            PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
             for( int j = 0; j <= h->param.i_bframe+1; j++ )
                 for( int i = 0; i <= h->param.i_bframe+1; i++ )
-                    CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
-            frame->i_intra_cost = frame->lowres_costs[0][0];
-            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+                    PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
+
         }
         if( h->param.rc.i_aq_mode )
         {
-            CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
-            CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
+            PREALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+            PREALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
             if( h->frames.b_have_lowres )
+                PREALLOC( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+        }
+    }
+
+    PREALLOC_END( frame->base );
+
+    if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
+    {
+        int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
+        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
+        if( PARAM_INTERLACED )
+            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
+    }
+
+    for( int p = 0; p < luma_plane_count; p++ )
+    {
+        int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
+        if( h->param.analyse.i_subpel_refine && b_fdec )
+        {
+            for( int i = 0; i < 4; i++ )
+            {
+                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
+                frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
+            }
+            frame->plane[p] = frame->filtered[p][0];
+            frame->plane_fld[p] = frame->filtered_fld[p][0];
+        }
+        else
+        {
+            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH;
+            frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH;
+        }
+    }
+
+    if( b_fdec )
+    {
+        M32( frame->mv16x16[0] ) = 0;
+        frame->mv16x16++;
+
+        if( h->param.analyse.i_me_method >= X264_ME_ESA )
+            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
+    }
+    else
+    {
+        if( h->frames.b_have_lowres )
+        {
+            int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
+            for( int i = 0; i < 4; i++ )
+                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
+
+            for( int j = 0; j <= !!h->param.i_bframe; j++ )
+                for( int i = 0; i <= h->param.i_bframe; i++ )
+                    memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) );
+
+            frame->i_intra_cost = frame->lowres_costs[0][0];
+            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+
+            if( h->param.rc.i_aq_mode )
                 /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
-                CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+                memset( frame->i_inv_qscale_factor, 0, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
         }
     }
 
@@ -278,42 +315,8 @@ void x264_frame_delete( x264_frame_t *frame )
      * so freeing those pointers would cause a double free later. */
     if( !frame->b_duplicate )
     {
-        for( int i = 0; i < 4; i++ )
-        {
-            x264_free( frame->buffer[i] );
-            x264_free( frame->buffer_fld[i] );
-        }
-        for( int i = 0; i < 4; i++ )
-            x264_free( frame->buffer_lowres[i] );
-        for( int i = 0; i < X264_BFRAME_MAX+2; i++ )
-            for( int j = 0; j < X264_BFRAME_MAX+2; j++ )
-                x264_free( frame->i_row_satds[i][j] );
-        for( int j = 0; j < 2; j++ )
-            for( int i = 0; i <= X264_BFRAME_MAX; i++ )
-            {
-                x264_free( frame->lowres_mvs[j][i] );
-                x264_free( frame->lowres_mv_costs[j][i] );
-            }
-        x264_free( frame->i_propagate_cost );
-        for( int j = 0; j <= X264_BFRAME_MAX+1; j++ )
-            for( int i = 0; i <= X264_BFRAME_MAX+1; i++ )
-                x264_free( frame->lowres_costs[j][i] );
-        x264_free( frame->f_qp_offset );
-        x264_free( frame->f_qp_offset_aq );
-        x264_free( frame->i_inv_qscale_factor );
-        x264_free( frame->i_row_bits );
-        x264_free( frame->f_row_qp );
-        x264_free( frame->f_row_qscale );
-        x264_free( frame->field );
-        x264_free( frame->effective_qp );
-        x264_free( frame->mb_type );
-        x264_free( frame->mb_partition );
-        x264_free( frame->mv[0] );
-        x264_free( frame->mv[1] );
-        if( frame->mv16x16 )
-            x264_free( frame->mv16x16-1 );
-        x264_free( frame->ref[0] );
-        x264_free( frame->ref[1] );
+        x264_free( frame->base );
+
         if( frame->param && frame->param->param_free )
             frame->param->param_free( frame->param );
         if( frame->mb_info_free )
diff --git a/common/frame.h b/common/frame.h
index 72c1fa3a..d8416a5d 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -35,6 +35,7 @@
 typedef struct x264_frame
 {
     /* */
+    uint8_t *base;       /* Base pointer for all malloced data in this frame. */
     int     i_poc;
     int     i_delta_poc[2];
     int     i_type;
diff --git a/common/macroblock.c b/common/macroblock.c
index 2d3e7e14..45405fde 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -256,25 +256,26 @@ int x264_macroblock_cache_allocate( x264_t *h )
 
     h->mb.b_interlaced = PARAM_INTERLACED;
 
-    CHECKED_MALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
-    CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
-    CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
-    CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
-    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
+    PREALLOC_INIT
+
+    PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
+    PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
+    PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
+    PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
 
     /* 0 -> 3 top(4), 4 -> 6 : left(3) */
-    CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
+    PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
 
     /* all coeffs */
-    CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );
+    PREALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );
 
     if( h->param.b_cabac )
     {
-        CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
-        CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
-        CHECKED_MALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) );
+        PREALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
+        PREALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
+        PREALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) );
         if( h->param.i_bframe )
-            CHECKED_MALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) );
+            PREALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) );
     }
 
     for( int i = 0; i < 2; i++ )
@@ -284,11 +285,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
             i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
 
         for( int j = !i; j < i_refs; j++ )
-        {
-            CHECKED_MALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) );
-            M32( h->mb.mvr[i][j][0] ) = 0;
-            h->mb.mvr[i][j]++;
-        }
+            PREALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) );
     }
 
     if( h->param.analyse.i_weighted_pred )
@@ -325,7 +322,24 @@ int x264_macroblock_cache_allocate( x264_t *h )
         }
 
         for( int i = 0; i < numweightbuf; i++ )
-            CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
+            PREALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
+    }
+
+    PREALLOC_END( h->mb.base );
+
+    memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
+
+    for( int i = 0; i < 2; i++ )
+    {
+        int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED;
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit
+
+        for( int j = !i; j < i_refs; j++ )
+        {
+            M32( h->mb.mvr[i][j][0] ) = 0;
+            h->mb.mvr[i][j]++;
+        }
     }
 
     return 0;
@@ -334,26 +348,7 @@ fail:
 }
 void x264_macroblock_cache_free( x264_t *h )
 {
-    for( int i = 0; i < 2; i++ )
-        for( int j = !i; j < X264_REF_MAX*2; j++ )
-            if( h->mb.mvr[i][j] )
-                x264_free( h->mb.mvr[i][j]-1 );
-    for( int i = 0; i < X264_REF_MAX; i++ )
-        x264_free( h->mb.p_weight_buf[i] );
-
-    if( h->param.b_cabac )
-    {
-        x264_free( h->mb.skipbp );
-        x264_free( h->mb.chroma_pred_mode );
-        x264_free( h->mb.mvd[0] );
-        x264_free( h->mb.mvd[1] );
-    }
-    x264_free( h->mb.slice_table );
-    x264_free( h->mb.intra4x4_pred_mode );
-    x264_free( h->mb.non_zero_count );
-    x264_free( h->mb.mb_transform_size );
-    x264_free( h->mb.cbp );
-    x264_free( h->mb.qp );
+    x264_free( h->mb.base );
 }
 
 int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
diff --git a/configure b/configure
index db1d9e77..6f3ac025 100755
--- a/configure
+++ b/configure
@@ -290,7 +290,8 @@ cross_prefix=""
 EXE=""
 
 # list of all preprocessor HAVE values we can define
-CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL"
+CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE \
+             LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP"
 
 # parse options
 
@@ -820,6 +821,10 @@ if cc_check "math.h" "-Werror" "return log2f(2);" ; then
     define HAVE_LOG2F
 fi
 
+if [ "$SYS" = "LINUX" -a \( "$ARCH" = "X86" -o "$ARCH" = "X86_64" \) ] && cc_check "sys/mman.h" "" "MADV_HUGEPAGE;" ; then
+    define HAVE_THP
+fi
+
 if [ "$vis" = "yes" ] ; then
     save_CFLAGS="$CFLAGS"
     CFLAGS="$CFLAGS -I/usr/X11R6/include"
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 167daa9a..9ff8acf1 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -2717,7 +2717,7 @@ static void x264_thread_sync_context( x264_t *dst, x264_t *src )
     x264_frame_push_unused( src, dst->fdec );
 
     // copy everything except the per-thread pointers and the constants.
-    memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) );
+    memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.base) - offsetof(x264_t, i_frame) );
     dst->param = src->param;
     dst->stat = src->stat;
     dst->pixf = src->pixf;
-- 
2.39.2