From fa1e2b746d95575b5c5b8e49fcfcad3ded9a5420 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 8 Jul 2013 12:06:42 -0700 Subject: [PATCH] Transparent hugepage support Combine frame and mb data mallocs into a single large malloc. Additionally, on Linux systems with hugepage support, ask for hugepages on large mallocs. This gives a small performance improvement (~0.2-0.9%) on systems without hugepage support, as well as a small memory footprint reduction. On recent Linux kernels with hugepage support enabled (set to madvise or always), it improves performance up to 4% at the cost of about 7-12% more memory usage on typical settings. It may help even more on Haswell and other recent CPUs with improved 2MB page support in hardware. --- common/common.c | 23 +++++- common/common.h | 26 +++++++ common/frame.c | 177 ++++++++++++++++++++++---------------------- common/frame.h | 1 + common/macroblock.c | 69 ++++++++--------- configure | 7 +- encoder/encoder.c | 2 +- 7 files changed, 178 insertions(+), 127 deletions(-) diff --git a/common/common.c b/common/common.c index 49215555..4e5ca82f 100644 --- a/common/common.c +++ b/common/common.c @@ -32,6 +32,9 @@ #if HAVE_MALLOC_H #include <malloc.h> #endif +#if HAVE_THP +#include <sys/mman.h> +#endif const int x264_bit_depth = BIT_DEPTH; @@ -1183,7 +1186,25 @@ void *x264_malloc( int i_size ) { uint8_t *align_buf = NULL; #if HAVE_MALLOC_H - align_buf = memalign( NATIVE_ALIGN, i_size ); +#if HAVE_THP +#define HUGE_PAGE_SIZE 2*1024*1024 +#define HUGE_PAGE_THRESHOLD HUGE_PAGE_SIZE*7/8 /* FIXME: Is this optimal? */ + /* Attempt to allocate huge pages to reduce TLB misses. */ + if( i_size >= HUGE_PAGE_THRESHOLD ) + { + align_buf = memalign( HUGE_PAGE_SIZE, i_size ); + if( align_buf ) + { + /* Round up to the next huge page boundary if we are close enough. 
*/ + size_t madv_size = (i_size + HUGE_PAGE_SIZE - HUGE_PAGE_THRESHOLD) & ~(HUGE_PAGE_SIZE-1); + madvise( align_buf, madv_size, MADV_HUGEPAGE ); + } + } + else +#undef HUGE_PAGE_SIZE +#undef HUGE_PAGE_THRESHOLD +#endif + align_buf = memalign( NATIVE_ALIGN, i_size ); #else uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) ); if( buf ) diff --git a/common/common.h b/common/common.h index c1d6a0c8..12e5763b 100644 --- a/common/common.h +++ b/common/common.h @@ -54,6 +54,31 @@ do {\ memset( var, 0, size );\ } while( 0 ) +/* Macros for merging multiple allocations into a single large malloc, for improved + * use with huge pages. */ + +/* Needs to be enough to contain any set of buffers that use combined allocations */ +#define PREALLOC_BUF_SIZE 1024 + +#define PREALLOC_INIT\ + int prealloc_idx = 0;\ + size_t prealloc_size = 0;\ + uint8_t **preallocs[PREALLOC_BUF_SIZE]; + +#define PREALLOC( var, size )\ +do {\ + var = (void*)prealloc_size;\ + preallocs[prealloc_idx++] = (uint8_t**)&var;\ + prealloc_size += ALIGN(size, NATIVE_ALIGN);\ +} while(0) + +#define PREALLOC_END( ptr )\ +do {\ + CHECKED_MALLOC( ptr, prealloc_size );\ + while( prealloc_idx-- )\ + *preallocs[prealloc_idx] += (intptr_t)ptr;\ +} while(0) + #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) #define X264_BFRAME_MAX 16 @@ -699,6 +724,7 @@ struct x264_t * and won't be copied from one thread to another */ /* mb table */ + uint8_t *base; /* base pointer for all malloced data in this mb */ int8_t *type; /* mb type */ uint8_t *partition; /* mb partition */ int8_t *qp; /* mb qp */ diff --git a/common/frame.c b/common/frame.c index e56da8ef..6203e3bf 100644 --- a/common/frame.c +++ b/common/frame.c @@ -86,6 +86,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) #endif CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) ); + PREALLOC_INIT /* allocate frame data (+64 for extra data for me) */ i_width = h->mb.i_mb_width*16; @@ -124,7 +125,7 @@ static x264_frame_t 
*x264_frame_new( x264_t *h, int b_fdec ) for( int i = 0; i < h->param.i_bframe + 2; i++ ) for( int j = 0; j < h->param.i_bframe + 2; j++ ) - CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) ); + PREALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) ); frame->i_poc = -1; frame->i_type = X264_TYPE_AUTO; @@ -149,13 +150,9 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) { int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12); int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv)); - CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) ); - frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH; + PREALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) ); if( PARAM_INTERLACED ) - { - CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) ); - frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH; - } + PREALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) ); } /* all 4 luma planes allocated together, since the cacheline split code @@ -167,24 +164,15 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) if( h->param.analyse.i_subpel_refine && b_fdec ) { /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. 
*/ - CHECKED_MALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) ); + PREALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) ); if( PARAM_INTERLACED ) - CHECKED_MALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) ); - for( int i = 0; i < 4; i++ ) - { - frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH; - frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH; - } - frame->plane[p] = frame->filtered[p][0]; - frame->plane_fld[p] = frame->filtered_fld[p][0]; + PREALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) ); } else { - CHECKED_MALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) ); + PREALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) ); if( PARAM_INTERLACED ) - CHECKED_MALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) ); - frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH; - frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH; + PREALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) ); } } @@ -192,36 +180,30 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) if( b_fdec ) /* fdec frame */ { - CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t)); - CHECKED_MALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t)); - CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) ); - CHECKED_MALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) ); - M32( frame->mv16x16[0] ) = 0; - frame->mv16x16++; - CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) ); + PREALLOC( frame->mb_type, i_mb_count * sizeof(int8_t) ); + PREALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t) ); + PREALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) ); + PREALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) ); + PREALLOC( frame->ref[0], 4 * i_mb_count * 
sizeof(int8_t) ); if( h->param.i_bframe ) { - CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) ); - CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) ); + PREALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) ); + PREALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) ); } else { frame->mv[1] = NULL; frame->ref[1] = NULL; } - CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) ); - CHECKED_MALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) ); - CHECKED_MALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) ); + PREALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) ); + PREALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) ); + PREALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) ); if( h->param.analyse.i_me_method >= X264_ME_ESA ) - { - CHECKED_MALLOC( frame->buffer[3], - frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa ); - frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH; - } + PREALLOC( frame->buffer[3], frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa ); if( PARAM_INTERLACED ) - CHECKED_MALLOC( frame->field, i_mb_count * sizeof(uint8_t) ); + PREALLOC( frame->field, i_mb_count * sizeof(uint8_t) ); if( h->param.analyse.b_mb_info ) - CHECKED_MALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) ); + PREALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) ); } else /* fenc frame */ { @@ -229,30 +211,85 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) { int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign ); - CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) ); - for( int i = 0; i < 4; i++ ) - frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size; + PREALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) ); 
for( int j = 0; j <= !!h->param.i_bframe; j++ ) for( int i = 0; i <= h->param.i_bframe; i++ ) { - CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) ); - CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) ); + PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) ); + PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) ); } - CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) ); + PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) ); for( int j = 0; j <= h->param.i_bframe+1; j++ ) for( int i = 0; i <= h->param.i_bframe+1; i++ ) - CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) ); - frame->i_intra_cost = frame->lowres_costs[0][0]; - memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) ); + PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) ); + } if( h->param.rc.i_aq_mode ) { - CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) ); - CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) ); + PREALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) ); + PREALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) ); if( h->frames.b_have_lowres ) + PREALLOC( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) ); + } + } + + PREALLOC_END( frame->base ); + + if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) + { + int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12); + frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH; + if( PARAM_INTERLACED ) + frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH; + } + + for( int p = 0; p < luma_plane_count; p++ ) + { + int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign ); + if( h->param.analyse.i_subpel_refine && b_fdec ) + { + for( int i = 0; i < 4; i++ ) + { + 
frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH; + frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH; + } + frame->plane[p] = frame->filtered[p][0]; + frame->plane_fld[p] = frame->filtered_fld[p][0]; + } + else + { + frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH; + frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH; + } + } + + if( b_fdec ) + { + M32( frame->mv16x16[0] ) = 0; + frame->mv16x16++; + + if( h->param.analyse.i_me_method >= X264_ME_ESA ) + frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH; + } + else + { + if( h->frames.b_have_lowres ) + { + int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign ); + for( int i = 0; i < 4; i++ ) + frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size; + + for( int j = 0; j <= !!h->param.i_bframe; j++ ) + for( int i = 0; i <= h->param.i_bframe; i++ ) + memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) ); + + frame->i_intra_cost = frame->lowres_costs[0][0]; + memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) ); + + if( h->param.rc.i_aq_mode ) /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */ - CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) ); + memset( frame->i_inv_qscale_factor, 0, (h->mb.i_mb_count+3) * sizeof(uint16_t) ); } } @@ -278,42 +315,8 @@ void x264_frame_delete( x264_frame_t *frame ) * so freeing those pointers would cause a double free later. 
*/ if( !frame->b_duplicate ) { - for( int i = 0; i < 4; i++ ) - { - x264_free( frame->buffer[i] ); - x264_free( frame->buffer_fld[i] ); - } - for( int i = 0; i < 4; i++ ) - x264_free( frame->buffer_lowres[i] ); - for( int i = 0; i < X264_BFRAME_MAX+2; i++ ) - for( int j = 0; j < X264_BFRAME_MAX+2; j++ ) - x264_free( frame->i_row_satds[i][j] ); - for( int j = 0; j < 2; j++ ) - for( int i = 0; i <= X264_BFRAME_MAX; i++ ) - { - x264_free( frame->lowres_mvs[j][i] ); - x264_free( frame->lowres_mv_costs[j][i] ); - } - x264_free( frame->i_propagate_cost ); - for( int j = 0; j <= X264_BFRAME_MAX+1; j++ ) - for( int i = 0; i <= X264_BFRAME_MAX+1; i++ ) - x264_free( frame->lowres_costs[j][i] ); - x264_free( frame->f_qp_offset ); - x264_free( frame->f_qp_offset_aq ); - x264_free( frame->i_inv_qscale_factor ); - x264_free( frame->i_row_bits ); - x264_free( frame->f_row_qp ); - x264_free( frame->f_row_qscale ); - x264_free( frame->field ); - x264_free( frame->effective_qp ); - x264_free( frame->mb_type ); - x264_free( frame->mb_partition ); - x264_free( frame->mv[0] ); - x264_free( frame->mv[1] ); - if( frame->mv16x16 ) - x264_free( frame->mv16x16-1 ); - x264_free( frame->ref[0] ); - x264_free( frame->ref[1] ); + x264_free( frame->base ); + if( frame->param && frame->param->param_free ) frame->param->param_free( frame->param ); if( frame->mb_info_free ) diff --git a/common/frame.h b/common/frame.h index 72c1fa3a..d8416a5d 100644 --- a/common/frame.h +++ b/common/frame.h @@ -35,6 +35,7 @@ typedef struct x264_frame { /* */ + uint8_t *base; /* Base pointer for all malloced data in this frame. 
*/ int i_poc; int i_delta_poc[2]; int i_type; diff --git a/common/macroblock.c b/common/macroblock.c index 2d3e7e14..45405fde 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -256,25 +256,26 @@ int x264_macroblock_cache_allocate( x264_t *h ) h->mb.b_interlaced = PARAM_INTERLACED; - CHECKED_MALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) ); - CHECKED_MALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) ); - CHECKED_MALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) ); - CHECKED_MALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) ); - memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) ); + PREALLOC_INIT + + PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) ); + PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) ); + PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) ); + PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) ); /* 0 -> 3 top(4), 4 -> 6 : left(3) */ - CHECKED_MALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) ); + PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) ); /* all coeffs */ - CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) ); + PREALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) ); if( h->param.b_cabac ) { - CHECKED_MALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) ); - CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) ); - CHECKED_MALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) ); + PREALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) ); + PREALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) ); + PREALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) ); if( h->param.i_bframe ) - CHECKED_MALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) ); + PREALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) ); } for( int i = 0; i < 2; i++ ) @@ -284,11 +285,7 @@ int x264_macroblock_cache_allocate( x264_t *h ) i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add 
two duplicate frames, one in >8-bit for( int j = !i; j < i_refs; j++ ) - { - CHECKED_MALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) ); - M32( h->mb.mvr[i][j][0] ) = 0; - h->mb.mvr[i][j]++; - } + PREALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) ); } if( h->param.analyse.i_weighted_pred ) @@ -325,7 +322,24 @@ int x264_macroblock_cache_allocate( x264_t *h ) } for( int i = 0; i < numweightbuf; i++ ) - CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) ); + PREALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) ); + } + + PREALLOC_END( h->mb.base ); + + memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) ); + + for( int i = 0; i < 2; i++ ) + { + int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED; + if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ) + i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit + + for( int j = !i; j < i_refs; j++ ) + { + M32( h->mb.mvr[i][j][0] ) = 0; + h->mb.mvr[i][j]++; + } } return 0; @@ -334,26 +348,7 @@ fail: } void x264_macroblock_cache_free( x264_t *h ) { - for( int i = 0; i < 2; i++ ) - for( int j = !i; j < X264_REF_MAX*2; j++ ) - if( h->mb.mvr[i][j] ) - x264_free( h->mb.mvr[i][j]-1 ); - for( int i = 0; i < X264_REF_MAX; i++ ) - x264_free( h->mb.p_weight_buf[i] ); - - if( h->param.b_cabac ) - { - x264_free( h->mb.skipbp ); - x264_free( h->mb.chroma_pred_mode ); - x264_free( h->mb.mvd[0] ); - x264_free( h->mb.mvd[1] ); - } - x264_free( h->mb.slice_table ); - x264_free( h->mb.intra4x4_pred_mode ); - x264_free( h->mb.non_zero_count ); - x264_free( h->mb.mb_transform_size ); - x264_free( h->mb.cbp ); - x264_free( h->mb.qp ); + x264_free( h->mb.base ); } int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) diff --git a/configure b/configure index db1d9e77..6f3ac025 100755 --- a/configure +++ b/configure @@ 
-290,7 +290,8 @@ cross_prefix="" EXE="" # list of all preprocessor HAVE values we can define -CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL" +CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE \ + LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP" # parse options @@ -820,6 +821,10 @@ if cc_check "math.h" "-Werror" "return log2f(2);" ; then define HAVE_LOG2F fi +if [ "$SYS" = "LINUX" -a \( "$ARCH" = "X86" -o "$ARCH" = "X86_64" \) ] && cc_check "sys/mman.h" "" "MADV_HUGEPAGE;" ; then + define HAVE_THP +fi + if [ "$vis" = "yes" ] ; then save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS -I/usr/X11R6/include" diff --git a/encoder/encoder.c b/encoder/encoder.c index 167daa9a..9ff8acf1 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -2717,7 +2717,7 @@ static void x264_thread_sync_context( x264_t *dst, x264_t *src ) x264_frame_push_unused( src, dst->fdec ); // copy everything except the per-thread pointers and the constants. - memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) ); + memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.base) - offsetof(x264_t, i_frame) ); dst->param = src->param; dst->stat = src->stat; dst->pixf = src->pixf; -- 2.39.2